Descriptive statistics

Matrix plot of platform data (year, skill ratings, etc.) and demographic data (age, gender, etc.) with correlation coefficients.

Entry

Create room-level data

add.covars <- function(x, logs = FALSE) {
    # Merge room-level mean covariates (built from the global `races` data)
    # onto x. `paid` is always replaced by its log (lpaid); when logs = TRUE,
    # hours and nreg are additionally logged.
    #
    # x:    a data frame with a room_id column (room-level aggregates).
    # logs: also add lhours and lnreg?  Default FALSE.
    # Returns: x merged with room means of the individual-level covariates.
    covars <- with(races, data.frame(year = 2015 - year, rating, 
        nreg, nsub, algo_rating = algo_rating/100, algo_nreg, 
        algo_nsub, paid, nwins = nwins > 0, ntop5 = ntop5 > 0, 
        ntop10 = ntop10 > 0, risk, hours, timezone.dist = abs(timezone + 
            5), male, postgrad = ifelse(educ == "Postgraduate (MA)" | 
            educ == "Phd", 1, 0), below30 = ifelse(age == "<20" | 
            age == "20-25" | age == "26-30", 1, 0)))
    # Room-level means of all covariates (NAs dropped per cell).
    controls <- aggregate(covars, by = with(races, list(room_id = room_id)), 
        mean, na.rm = TRUE)
    out <- merge(x, controls)
    out$lpaid <- log(out$paid)  # Use logs for paid
    out$paid <- NULL
    if (logs) {
        # BUG FIX: the original recomputed out$lpaid <- log(out$paid) here,
        # after out$paid had already been removed above, which fails.
        # lpaid is already set; only add the remaining logged controls.
        out$lhours <- log(out$hours)
        out$lnreg <- log(out$nreg)
    }
    # Room mean of nwins is a proportion; convert to "any winner in room".
    out$nwins <- out$nwins > 0
    return(out)
}
# Room size: number of competitor rows per room.
races$n <- ave(races$submit, races$room_id, FUN = length)
# Room-level entry counts (sum of submissions per room) with mean covariates
# attached via add.covars().
entry <- add.covars(aggregate(submit ~ n + room_id + treatment + 
    room_size, data = races, sum))
summary(entry)
##     room_id         n              treatment room_size      submit     
##  race 1 : 1   Min.   : 9.00   race      :8   Large:12   Min.   :1.000  
##  race 2 : 1   1st Qu.:10.00   tournament:8   Small:12   1st Qu.:2.750  
##  race 3 : 1   Median :12.50   reserve   :8              Median :4.000  
##  race 4 : 1   Mean   :12.46                             Mean   :3.583  
##  race 5 : 1   3rd Qu.:15.00                             3rd Qu.:5.000  
##  race 6 : 1   Max.   :15.00                             Max.   :6.000  
##  (Other):18                                                            
##       year           rating           nreg            nsub       
##  Min.   :3.000   Min.   :11.37   Min.   : 3.70   Min.   : 1.300  
##  1st Qu.:4.350   1st Qu.:12.43   1st Qu.:12.00   1st Qu.: 4.675  
##  Median :4.950   Median :13.17   Median :16.97   Median : 7.350  
##  Mean   :5.037   Mean   :13.37   Mean   :17.21   Mean   : 7.036  
##  3rd Qu.:5.633   3rd Qu.:14.37   3rd Qu.:22.65   3rd Qu.: 9.033  
##  Max.   :7.200   Max.   :15.93   Max.   :28.53   Max.   :12.800  
##                                                                  
##   algo_rating       algo_nreg       algo_nsub       nwins        
##  Min.   : 7.045   Min.   :16.00   Min.   :14.10   Mode :logical  
##  1st Qu.: 9.296   1st Qu.:36.38   1st Qu.:31.82   FALSE:9        
##  Median :10.780   Median :45.45   Median :39.75   TRUE :15       
##  Mean   :10.547   Mean   :46.16   Mean   :40.96   NA's :0        
##  3rd Qu.:11.544   3rd Qu.:59.33   3rd Qu.:54.09                  
##  Max.   :12.991   Max.   :73.00   Max.   :65.70                  
##                                                                  
##      ntop5            ntop10            risk           hours      
##  Min.   :0.0000   Min.   :0.0000   Min.   :5.357   Min.   :18.29  
##  1st Qu.:0.1833   1st Qu.:0.2000   1st Qu.:6.178   1st Qu.:26.05  
##  Median :0.2667   Median :0.3000   Median :6.333   Median :30.07  
##  Mean   :0.2222   Mean   :0.3046   Mean   :6.445   Mean   :31.47  
##  3rd Qu.:0.3083   3rd Qu.:0.4000   3rd Qu.:6.851   3rd Qu.:37.78  
##  Max.   :0.4000   Max.   :0.5333   Max.   :7.625   Max.   :45.13  
##                                                                   
##  timezone.dist         male           postgrad         below30      
##  Min.   : 5.125   Min.   :0.7500   Min.   :0.1429   Min.   :0.4000  
##  1st Qu.: 6.475   1st Qu.:0.9000   1st Qu.:0.2500   1st Qu.:0.5714  
##  Median : 7.537   Median :1.0000   Median :0.4000   Median :0.6333  
##  Mean   : 7.653   Mean   :0.9482   Mean   :0.3607   Mean   :0.6787  
##  3rd Qu.: 8.710   3rd Qu.:1.0000   3rd Qu.:0.4381   3rd Qu.:0.8115  
##  Max.   :10.556   Max.   :1.0000   Max.   :0.5556   Max.   :0.9000  
##                                                                     
##      lpaid       
##  Min.   : 8.052  
##  1st Qu.: 9.143  
##  Median :10.009  
##  Mean   :10.134  
##  3rd Qu.:11.090  
##  Max.   :12.293  
## 

Linear regression

# Baseline model: log entry rate on treatment and room-size dummies.
# NOTE(review): glm() with the default gaussian family is equivalent to lm().
entry.lm <- glm(log(submit/n) ~ treatment + room_size, data = entry)
summary(entry.lm)
## 
## Call:
## glm(formula = log(submit/n) ~ treatment + room_size, data = entry)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.83222  -0.24266  -0.01059   0.26639   0.80156  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.37050    0.16875  -8.122 9.22e-08 ***
## treatmenttournament  0.30255    0.20667   1.464    0.159    
## treatmentreserve     0.02434    0.20667   0.118    0.907    
## room_sizeSmall      -0.12421    0.16875  -0.736    0.470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.170851)
## 
##     Null deviance: 3.9617  on 23  degrees of freedom
## Residual deviance: 3.4170  on 20  degrees of freedom
## AIC: 31.326
## 
## Number of Fisher Scoring iterations: 2

Linear regression diagnostics

# Arrange the four standard regression diagnostic plots in a 2x2 grid.
layout(matrix(1:4, nrow = 2, ncol = 2))
plot(entry.lm)

Linear regression with all controls

# Baseline model plus every room-level control; entry[, -1] drops the first
# column (room_id, per the summary above) so `.` picks up all remaining ones.
entry.lm.all <- glm(log(submit/n) ~ ., data = entry[, -1])
summary(entry.lm.all)
## 
## Call:
## glm(formula = log(submit/n) ~ ., data = entry[, -1])
## 
## Deviance Residuals: 
##         1          2          3          4          5          6  
##  0.045293  -0.029772  -0.004308   0.056464   0.052742  -0.017453  
##         7          8          9         10         11         12  
## -0.089327  -0.013639  -0.035707   0.022246  -0.039607  -0.027303  
##        13         14         15         16         17         18  
##  0.051768   0.057219   0.007678  -0.036293  -0.014274  -0.000146  
##        19         20         21         22         23         24  
##  0.050537  -0.023424   0.056149  -0.007597  -0.017957  -0.043289  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         -8.72554    4.72104  -1.848   0.1617  
## treatmenttournament  0.58895    0.15719   3.747   0.0332 *
## treatmentreserve     0.11446    0.16704   0.685   0.5424  
## room_sizeSmall      -0.20902    0.15746  -1.327   0.2764  
## year                 0.63942    0.23953   2.670   0.0757 .
## rating               0.33075    0.09302   3.556   0.0379 *
## nreg                -0.14251    0.07454  -1.912   0.1518  
## nsub                 0.31100    0.18484   1.683   0.1911  
## algo_rating         -0.12715    0.04227  -3.008   0.0573 .
## algo_nreg            0.10868    0.02399   4.530   0.0201 *
## algo_nsub           -0.11131    0.02734  -4.072   0.0267 *
## nwinsTRUE           -0.64568    0.19798  -3.261   0.0471 *
## ntop5               -1.25284    1.06177  -1.180   0.3231  
## ntop10               0.97479    0.90602   1.076   0.3608  
## risk                -0.85303    0.16385  -5.206   0.0138 *
## hours               -0.01399    0.01564  -0.895   0.4369  
## timezone.dist       -0.05276    0.04517  -1.168   0.3272  
## male                 4.61821    2.34450   1.970   0.1435  
## postgrad            -2.66143    1.23093  -2.162   0.1193  
## below30              4.61756    1.74298   2.649   0.0770 .
## lpaid                0.09389    0.12870   0.730   0.5185  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.01250787)
## 
##     Null deviance: 3.961652  on 23  degrees of freedom
## Residual deviance: 0.037524  on  3  degrees of freedom
## AIC: -42.951
## 
## Number of Fisher Scoring iterations: 2

Linear regression with all controls – diagnostics

# Diagnostic plots for the all-controls model, 2x2 grid.
layout(matrix(1:4, nrow = 2, ncol = 2))
plot(entry.lm.all)

Linear regression with some of the controls

# Mid-sized specification: the baseline model plus a hand-picked subset of
# the controls.
entry.lm.some <- update(entry.lm, "~ . + rating+nreg+nwins+hours+below30+risk+year+timezone.dist")
summary(entry.lm.some)
## 
## Call:
## glm(formula = log(submit/n) ~ treatment + room_size + rating + 
##     nreg + nwins + hours + below30 + risk + year + timezone.dist, 
##     data = entry)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.36135  -0.13818   0.00601   0.13666   0.36092  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         -3.629433   1.683283  -2.156  0.05207 . 
## treatmenttournament  0.451951   0.145178   3.113  0.00897 **
## treatmentreserve     0.254329   0.219926   1.156  0.27001   
## room_sizeSmall      -0.124668   0.136684  -0.912  0.37968   
## rating               0.136975   0.054946   2.493  0.02828 * 
## nreg                 0.015475   0.020545   0.753  0.46584   
## nwinsTRUE           -0.566535   0.182752  -3.100  0.00919 **
## hours                0.011408   0.009418   1.211  0.24909   
## below30              2.081632   0.701793   2.966  0.01178 * 
## risk                -0.467709   0.118771  -3.938  0.00197 **
## year                 0.184912   0.136793   1.352  0.20138   
## timezone.dist        0.091483   0.043232   2.116  0.05592 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07534486)
## 
##     Null deviance: 3.96165  on 23  degrees of freedom
## Residual deviance: 0.90414  on 12  degrees of freedom
## AIC: 15.417
## 
## Number of Fisher Scoring iterations: 2

Linear regression with some of the controls – diagnostics

# Diagnostic plots for the mid-sized model, 2x2 grid.
layout(matrix(1:4, nrow = 2, ncol = 2))
plot(entry.lm.some)

Data-driven model selection

“If we do go ahead and use the same data twice, once to pick a model and once to test hypotheses about that model, we will get confidence intervals which are systematically too narrow, p-values which are systematically too small, etc.” – Shalizi (2015)

Step-wise model selection on half sample

# Split-sample strategy: pick the model on a random half of the rooms,
# then (below) test it on the other half.
# NOTE(review): no set.seed() is visible, so the split is not reproducible
# across runs -- consider seeding before sample().
i <- sample(1:nrow(entry), size = nrow(entry)/2)
# Candidate variables for the stepwise search.
j <- c("submit", "n", "treatment", "room_size", "year", "rating", 
    "nreg", "nwins", "risk", "hours", "below30")
# AIC-based stepwise selection on the selection half only.
entry.lm.step.half <- step(update(entry.lm.all, data = entry[, 
    j], subset = i))
## Start:  AIC=-3.54
## log(submit/n) ~ treatment + room_size + year + rating + nreg + 
##     nwins + risk + hours + below30
## 
##             Df Deviance     AIC
## - nwins      1 0.070820 -5.5356
## - hours      1 0.071134 -5.4827
## - room_size  1 0.071315 -5.4521
## <none>         0.070789 -3.5410
## - rating     1 0.102555 -1.0927
## - below30    1 0.106360 -0.6555
## - nreg       1 0.196700  6.7227
## - year       1 0.197366  6.7633
## - treatment  2 0.289514  9.3610
## - risk       1 0.310764 12.2110
## 
## Step:  AIC=-5.54
## log(submit/n) ~ treatment + room_size + year + rating + nreg + 
##     risk + hours + below30
## 
##             Df Deviance     AIC
## - hours      1  0.07113 -7.4825
## - room_size  1  0.07167 -7.3930
## <none>          0.07082 -5.5356
## - rating     1  0.10365 -2.9651
## - below30    1  0.12787 -0.4451
## - year       1  0.21538  5.8116
## - nreg       1  0.36103 12.0101
## - treatment  2  0.44172 12.4308
## - risk       1  0.60518 18.2089
## 
## Step:  AIC=-7.48
## log(submit/n) ~ treatment + room_size + year + rating + nreg + 
##     risk + below30
## 
##             Df Deviance     AIC
## - room_size  1  0.07167 -9.3924
## <none>          0.07113 -7.4825
## - rating     1  0.10744 -4.5338
## - below30    1  0.16161  0.3652
## - year       1  0.23773  4.9962
## - nreg       1  0.36497 10.1404
## - treatment  2  0.49325 11.7548
## - risk       1  0.62918 16.6755
## 
## Step:  AIC=-9.39
## log(submit/n) ~ treatment + year + rating + nreg + risk + below30
## 
##             Df Deviance     AIC
## <none>          0.07167 -9.3924
## - rating     1  0.11893 -5.3151
## - below30    1  0.19228  0.4499
## - year       1  0.37856  8.5790
## - nreg       1  0.56673 13.4212
## - treatment  2  0.97972 17.9898
## - risk       1  1.03529 20.6518
# Summary of the stepwise-selected model on the selection half.
summary(entry.lm.step.half)
## 
## Call:
## glm(formula = log(submit/n) ~ treatment + year + rating + nreg + 
##     risk + below30, data = entry[, j], subset = i)
## 
## Deviance Residuals: 
##         9          5         13         23          6         24  
## -0.083924  -0.108213   0.041056  -0.020625   0.083978   0.124433  
##         8         22          2          7          4         11  
## -0.084223  -0.103808   0.028811   0.087581  -0.007934   0.042868  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          4.27446    1.40170   3.049  0.03805 * 
## treatmenttournament  0.55359    0.14011   3.951  0.01680 * 
## treatmentreserve     1.69520    0.25943   6.534  0.00283 **
## year                 0.40254    0.09727   4.139  0.01439 * 
## rating               0.10046    0.06186   1.624  0.17969   
## nreg                -0.08935    0.01700  -5.256  0.00627 **
## risk                -1.03669    0.14136  -7.333  0.00184 **
## below30             -1.98306    0.76435  -2.594  0.06040 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.01791772)
## 
##     Null deviance: 1.878654  on 11  degrees of freedom
## Residual deviance: 0.071671  on  4  degrees of freedom
## AIC: -9.3924
## 
## Number of Fisher Scoring iterations: 2

Inference on other half sample

# Take the selected formula and refit it on the held-out half of the sample
# (all row indices not used for selection).
model <- formula(entry.lm.step.half)
entry.lm <- lm(model, data = entry,
    subset = setdiff(seq_len(nrow(entry)), i))
summary(entry.lm)
## 
## Call:
## lm(formula = model, data = entry, subset = setdiff(1:nrow(entry), 
##     i))
## 
## Residuals:
##        1        3       10       12       14       15       16       17 
##  0.01272 -0.01272  0.25782 -0.09511  0.34791 -0.46651 -0.04411 -0.10343 
##       18       19       20       21 
##  0.09757  0.17755 -0.33359  0.16191 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)         -1.45710    3.30825  -0.440    0.682
## treatmenttournament -0.07487    0.39531  -0.189    0.859
## treatmentreserve    -0.42106    0.43889  -0.959    0.392
## year                -0.03736    0.28372  -0.132    0.902
## rating               0.07786    0.09276   0.839    0.448
## nreg                 0.03515    0.03560   0.987    0.379
## risk                -0.35098    0.20901  -1.679    0.168
## below30              1.79145    1.52905   1.172    0.306
## 
## Residual standard error: 0.3891 on 4 degrees of freedom
## Multiple R-squared:  0.582,  Adjusted R-squared:  -0.1495 
## F-statistic: 0.7956 on 7 and 4 DF,  p-value: 0.6296

Robust Linear regression with all controls – failed

# Robust (Huber M-estimation) regression with all controls.
# FIX: library() instead of require() -- require() returns FALSE instead of
# erroring when MASS is missing, so the rlm() call would fail obscurely.
library(MASS)
entry.rlm <- rlm(log(submit/n) ~ ., data = entry[, -1], maxit = 50)
summary(entry.rlm)
## 
## Call: rlm(formula = log(submit/n) ~ ., data = entry[, -1], maxit = 50)
## Residuals:
##          1          2          3          4          5          6 
##  2.018e-05 -1.085e-05 -5.233e-06  2.170e-05  3.835e-02 -1.765e-05 
##          7          8          9         10         11         12 
## -3.617e-01 -8.155e-06 -1.857e-05 -1.938e-06 -6.403e-06 -9.972e-06 
##         13         14         15         16         17         18 
##  2.456e-05  1.616e-05  3.041e-06 -6.878e-06 -4.001e-06 -5.766e-06 
##         19         20         21         22         23         24 
##  2.236e-05 -1.513e-06  5.671e-02  8.271e-07 -4.571e-05 -2.038e-05 
## 
## Coefficients:
##                     Value      Std. Error t value   
## (Intercept)           -12.8115     0.0032 -3965.2505
## treatmenttournament     0.6989     0.0001  6496.3384
## treatmentreserve        0.1529     0.0001  1337.6861
## room_sizeSmall         -0.3272     0.0001 -3035.9294
## year                    0.8194     0.0002  4998.7459
## rating                  0.4020     0.0001  6314.0938
## nreg                   -0.1933     0.0001 -3788.3626
## nsub                    0.4586     0.0001  3625.4356
## algo_rating            -0.0994     0.0000 -3434.8618
## algo_nreg               0.1429     0.0000  8701.8337
## algo_nsub              -0.1502     0.0000 -8026.5545
## nwinsTRUE              -0.8281     0.0001 -6112.0504
## ntop5                  -1.3229     0.0007 -1820.6254
## ntop10                 -0.1504     0.0006  -242.5310
## risk                   -0.9730     0.0001 -8677.0773
## hours                  -0.0184     0.0000 -1716.8155
## timezone.dist          -0.0374     0.0000 -1208.6091
## male                    6.0207     0.0016  3752.3557
## postgrad               -3.0690     0.0008 -3643.1237
## below30                 5.9835     0.0012  5016.1328
## lpaid                   0.1885     0.0001  2139.9692
## 
## Residual standard error: 2.615e-05 on 3 degrees of freedom

Robust Linear regression with some of the controls

# Robust regression with the hand-picked subset of controls.
# FIX: library() instead of require() for a hard dependency (see above:
# require() fails silently by returning FALSE).
library(MASS)
entry.rlm <- rlm(log(submit/n) ~ treatment + room_size + rating + 
    nreg + nwins + hours + below30 + risk + year + timezone.dist, 
    data = entry[, -1], maxit = 50)
summary(entry.rlm)
## 
## Call: rlm(formula = log(submit/n) ~ treatment + room_size + rating + 
##     nreg + nwins + hours + below30 + risk + year + timezone.dist, 
##     data = entry[, -1], maxit = 50)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.411982 -0.111404  0.007527  0.136716  0.393944 
## 
## Coefficients:
##                     Value   Std. Error t value
## (Intercept)         -3.7919  1.7145    -2.2117
## treatmenttournament  0.4592  0.1479     3.1054
## treatmentreserve     0.2790  0.2240     1.2456
## room_sizeSmall      -0.1611  0.1392    -1.1568
## rating               0.1343  0.0560     2.4000
## nreg                 0.0167  0.0209     0.7978
## nwinsTRUE           -0.5956  0.1861    -3.1999
## hours                0.0126  0.0096     1.3127
## below30              2.2232  0.7148     3.1102
## risk                -0.4800  0.1210    -3.9678
## year                 0.2180  0.1393     1.5645
## timezone.dist        0.0894  0.0440     2.0295
## 
## Residual standard error: 0.1828 on 12 degrees of freedom

PCA to reduce dimensionality of controls

# Principal components of the control columns (centered and standardized).
pca <- prcomp(entry[, -c(1:5)], center = TRUE, scale = TRUE)
# summary(pca)
par(mfrow = c(1, 2))
plot(pca)
biplot(pca)

# Attach the first five component scores as columns pca1..pca5.
pca.hat <- predict(pca)
for (k in 1:5) {
    entry[[paste0("pca", k)]] <- pca.hat[, k]
}
# The first five components account for 74 % of the variance.

Linear regression with PCA controls

# Nested models adding one principal component at a time.
# FIX: the original used `m <- rep()`, which only works by accident (if at
# all); list() is the intended empty container for collecting models.
m <- list()
m$m1 <- update(entry.lm, "~.+pca1")
m$m2 <- update(entry.lm, "~.+pca1+pca2")
m$m3 <- update(entry.lm, "~.+pca1+pca2+pca3")
m$m4 <- update(entry.lm, "~.+pca1+pca2+pca3+pca4")
m$m5 <- update(entry.lm, "~.+pca1+pca2+pca3+pca4+pca5")
# NOTE(review): stargazer is a third-party package; confirm it is loaded.
stargazer(entry.lm, m, type = "text", digits = 2)
## 
## ====================================================================================================
##                                                   Dependent variable:                               
##                     --------------------------------------------------------------------------------
##                                                      log(submit/n)                                  
##                           (1)              (2)              (3)               (4)         (5)   (6) 
## ----------------------------------------------------------------------------------------------------
## treatmenttournament      -0.07            -0.12            -0.08             -0.08       0.66  0.66 
##                          (0.40)           (0.49)           (0.62)           (0.87)                  
##                                                                                                     
## treatmentreserve         -0.42            -0.53            -0.38             -0.47       -7.25 -7.25
##                          (0.44)           (0.69)           (1.08)           (1.82)                  
##                                                                                                     
## year                     -0.04            -0.13             0.03             0.03        0.36  0.36 
##                          (0.28)           (0.52)           (0.95)           (1.33)                  
##                                                                                                     
## rating                    0.08             0.06             0.12             0.17        0.85  0.85 
##                          (0.09)           (0.15)           (0.35)           (0.73)                  
##                                                                                                     
## nreg                      0.04             0.04             0.03             0.02        1.37  1.37 
##                          (0.04)           (0.04)           (0.07)           (0.13)                  
##                                                                                                     
## risk                     -0.35            -0.35            -0.53             -0.53       -5.76 -5.76
##                          (0.21)           (0.24)           (0.89)           (1.25)                  
##                                                                                                     
## below30                   1.79             1.95             1.54             1.71        -1.12 -1.12
##                          (1.53)           (1.89)           (2.92)           (4.51)                  
##                                                                                                     
## pca1                                       0.06            -0.01             0.06        -2.35 -2.35
##                                           (0.26)           (0.45)           (1.01)                  
##                                                                                                     
## pca2                                                        0.09             0.13        2.67  2.67 
##                                                            (0.38)           (0.74)                  
##                                                                                                     
## pca3                                                                         0.09        -1.02 -1.02
##                                                                             (0.95)                  
##                                                                                                     
## pca4                                                                                     -5.37 -5.37
##                                                                                                     
##                                                                                                     
## pca5                                                                                                
##                                                                                                     
##                                                                                                     
## Constant                 -1.46            -0.81            -0.97             -1.62       1.07  1.07 
##                          (3.31)           (4.80)           (5.86)           (10.79)                 
##                                                                                                     
## ----------------------------------------------------------------------------------------------------
## Observations               12               12               12               12          12    12  
## R2                        0.58             0.59             0.60             0.60        1.00  1.00 
## Adjusted R2              -0.15            -0.51            -1.21             -3.38                  
## Residual Std. Error  0.39 (df = 4)    0.45 (df = 3)    0.54 (df = 2)     0.76 (df = 1)              
## F Statistic         0.80 (df = 7; 4) 0.54 (df = 8; 3) 0.33 (df = 9; 2) 0.15 (df = 10; 1)            
## ====================================================================================================
## Note:                                                                    *p<0.1; **p<0.05; ***p<0.01

Other differences in entry

We start with differences in the empirical proportions.

# Cross-tabulate mean participation by treatment and room size.
# NOTE(review): this overwrites the room-level `entry` data frame built
# earlier with a table object -- rename if the data frame is still needed.
entry <- xtabs(submit ~ treatment + room_size, data = aggregate(submit ~ 
    treatment + room_size, races, mean))
knitr::kable(round(100 * entry, 1), caption = "Participation rates by competition style and room size")
Participation rates by competition style and room size
Large Small
race 26.7 25.6
tournament 30.0 37.5
reserve 30.0 22.5

Regression models

We first study the impact of competition style on entry.

Sorting

We regress all variables against treatment and room size dummies.

# Does skill rating differ by entry decision, treatment, and room size?
rating.lm <- lm(rating ~ submit * treatment + room_size, data = races)
rating.sum <- summary(rating.lm)
extract_info <- function(object) {
    # Extract estimates and standard errors from a model summary and
    # interleave them into a long data frame (two rows per coefficient:
    # "Estimates" then "St.err").
    #
    # object: a summary object whose coefficient matrix has estimates in
    #         column 1 and standard errors in column 2.
    # FIX: use coef(object) instead of object$coef, which relied on
    # partial name matching against the `coefficients` component.
    cf <- coef(object)
    est <- round(cf[, 1], 2)
    se <- paste("(", round(cf[, 2], 2), ")", sep = "")
    vec <- as.vector(rbind(est, se))
    vec.info <- rep(c("Estimates", "St.err"), length(est))
    vec.names <- rep(names(est), each = 2)
    data.frame(names = vec.names, info = vec.info, values = vec)
}
# Full outer join of two coefficient tables on variable name and row type,
# keeping rows present in only one of the two tables.
merge_info <- function(x, y) {
    merge(x, y, by = c("names", "info"), all = TRUE)
}
# Build a regression table: summarize each model in x, extract its
# coefficients, and outer-join all tables into one wide data frame.
regmat <- function(x) {
    summaries <- lapply(x, function(m) extract_info(summary(m)))
    Reduce(merge_info, summaries)
}

# Fit and print the sorting regression directly for display.
summary(lm(rating ~ submit * treatment + room_size, data = races))
## 
## Call:
## lm(formula = rating ~ submit * treatment + room_size, data = races)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9261 -2.8764 -0.4474  2.2320 15.4873 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     12.3809     0.6464  19.155   <2e-16 ***
## submitTRUE                       2.1252     1.0574   2.010   0.0458 *  
## treatmenttournament             -0.4536     0.8666  -0.523   0.6013    
## treatmentreserve                 0.5097     0.8775   0.581   0.5620    
## room_sizeSmall                   0.3066     0.5865   0.523   0.6017    
## submitTRUE:treatmenttournament   1.0649     1.4636   0.728   0.4677    
## submitTRUE:treatmentreserve     -1.6246     1.4876  -1.092   0.2761    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.155 on 203 degrees of freedom
##   (89 observations deleted due to missingness)
## Multiple R-squared:  0.06715,    Adjusted R-squared:  0.03957 
## F-statistic: 2.435 on 6 and 203 DF,  p-value: 0.02699
coefplot <- function(object, ...) {
    # Dot plot of coefficient estimates (intercept excluded), one row per
    # coefficient, with +/-1 SE (thick) and +/-2 SE (thin) horizontal bars.
    #
    # object: a fitted model whose summary() carries a coefficient matrix
    #         (estimates in column 1, standard errors in column 2).
    # ...:    further graphical parameters passed on to plot().
    object.sum <- summary(object)
    est <- object.sum$coef[-1, 1]  # drop the intercept row
    se <- object.sum$coef[-1, 2]
    # x-range wide enough to contain the +/-2 SE bars.
    xlim <- range(c(est + 2 * se, est - 2 * se))
    plot(y = 1:length(est), x = est, xlim = xlim, ...)
    segments(x0 = est + se, x1 = est - se, y0 = 1:length(est), 
        lwd = 2)
    segments(x0 = est + 2 * se, x1 = est - 2 * se, y0 = 1:length(est))
}
# Refit sorting model contrasting the race treatment against the others.
rating.lm <- lm(rating ~ submit * I(treatment == "race") + room_size, 
    data = races)
vars <- c("year", "rating", "nreg", "nsub", "nwins", "ntop10", 
    "hours", "timezone")
par(mfrow = c(3, 3))
# Plot coefficients for each outcome variable and keep the fitted models.
# BUG FIX: the original anonymous function ended with title(x), so lapply()
# collected NULLs instead of model objects and regmat(models) failed with
# "$ operator is invalid for atomic vectors" (see the error output below).
models <- lapply(vars, function(x) {
    fit <- update(rating.lm, paste(x, "~."))
    coefplot(fit, pch = 16, bty = "n", yaxt = "n")
    abline(v = 0, lty = 3)
    title(x)
    fit
})
m <- regmat(models)
## Error in object$coef: $ operator is invalid for atomic vectors
# Transpose so each model runs across a column, then label the rows.
mt <- t(m)
rownames(mt) <- c("names", "info", vars)
## Error in `rownames<-`(`*tmp*`, value = c("names", "info", "year", "rating", : length of 'dimnames' [1] not equal to array extent
# Drop the name/info label rows, leaving one row of results per variable.
mt[-c(1, 2), ]
##      m1 m2 m3 m4 m5

We assume the conditional mean of the proportion of entrants \(Y_{ij}=E_{ij}/N_{j}\) observed in a room randomly assigned to the \(i\)th treatment and of room size \(N_j\) is \[ E[Y_{ij}=y \mid x] = \text{competition}_{i} + \text{size}_{j} \]

The model is fully specified by assuming \(var(Y\mid x)\) is normal with constant variance.

# Rebuild room-level data: n = room size, submit = number of entrants.
races$n <- ave(races$submit, races$room_id, FUN = length)
# NOTE(review): aggregates by `room` here while earlier code used `room_id`
# -- confirm both columns exist in races.
rooms <- aggregate(submit ~ room + room_size + n + treatment, 
    data = races, sum)

# Entry rate: proportion of a room's competitors who submitted.
rooms$y <- rooms$submit/rooms$n
# Linear probability model of the entry rate.
rooms.lm <- lm(y ~ treatment + room_size, data = rooms)
# rooms.lm <- lm(log(y) ~ treatment+room_size, data=rooms)
summary(rooms.lm)
## 
## Call:
## lm(formula = y ~ treatment + room_size, data = rooms)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.160648 -0.063310 -0.006019  0.046759  0.240741 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.262963   0.044849   5.863 9.79e-06 ***
## treatmenttournament  0.076389   0.054928   1.391    0.180    
## treatmentreserve     0.001389   0.054928   0.025    0.980    
## room_sizeSmall      -0.003704   0.044849  -0.083    0.935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1099 on 20 degrees of freedom
## Multiple R-squared:  0.1127, Adjusted R-squared:  -0.02043 
## F-statistic: 0.8465 on 3 and 20 DF,  p-value: 0.4846

This model suggests the tournament difference is significant under a one-sided test, but the model as a whole is not significant according to the F-test. We try with the logarithm.

# Same specification on the log entry rate.
rooms.lm.log <- lm(log(y) ~ treatment + room_size, data = rooms)
summary(rooms.lm.log)
## 
## Call:
## lm(formula = log(y) ~ treatment + room_size, data = rooms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83222 -0.24266 -0.01059  0.26639  0.80156 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.37050    0.16875  -8.122 9.22e-08 ***
## treatmenttournament  0.30255    0.20667   1.464    0.159    
## treatmentreserve     0.02434    0.20667   0.118    0.907    
## room_sizeSmall      -0.12421    0.16875  -0.736    0.470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4133 on 20 degrees of freedom
## Multiple R-squared:  0.1375, Adjusted R-squared:  0.008097 
## F-statistic: 1.063 on 3 and 20 DF,  p-value: 0.3871

Robust linear regression model.

library(MASS)
# Huber robust regression; rlm() reports no p-values, so compute two-sided
# t-based p-values by hand using 20 residual degrees of freedom (matching
# the OLS fit above).
rooms.rlm <- rlm(y ~ treatment + room_size, data = rooms)
mat <- coef(summary(rooms.rlm))
round(cbind(mat, pval = 2 * (1 - pt(abs(mat[, 3]), df = 20))), 
    3)
##                      Value Std. Error t value pval
## (Intercept)          0.256      0.044   5.790 0.00
## treatmenttournament  0.072      0.054   1.327 0.20
## treatmentreserve     0.023      0.054   0.418 0.68
## room_sizeSmall      -0.025      0.044  -0.563 0.58

Model checking. Residuals do not look normal.

# Residual diagnostics for the linear probability model.
res <- resid(rooms.lm)  ## Raw residuals
lev <- hatvalues(rooms.lm)  ## Leverages
res.mo <- res/sqrt(1 - lev)  ## Modified (leverage-adjusted) residuals

# Diagnostic plots
par(mfrow = c(1, 2))
qqnorm(res.mo, pch = 16, xlab = "Quantiles Standard Normal", 
    ylab = "Modified residuals", main = "")
# FIX: dropped pch = 16 from qqline() -- a point-symbol parameter is
# meaningless for the reference line and is ignored by abline().
qqline(res.mo)
plot(lev, res.mo, pch = 16, xlab = "Leverage h", ylab = "Modified residuals")

After resampling errors, we find only small differences between bootstrapped and asymptotic estimates of coefficients’ standard errors.

# Residual (fixed-x) bootstrap: keep the fitted values, resample residuals.
rooms$res <- resid(rooms.lm)
rooms$fitted <- predict(rooms.lm)
f <- function(data, i, object) {
    # Statistic for boot(): rebuild y as fitted + resampled residuals,
    # refit the model, and return coefficients plus the residual SE.
    d <- data
    d$y <- d$fitted + d$res[i]
    lm.res <- update(object, data = d)
    c(coef(lm.res), sigma(lm.res))
}
# NOTE(review): boot() comes from the boot package; no library(boot) call
# is visible in this chunk -- confirm it is loaded earlier.
rooms.boot <- boot(f, R = 499, data = rooms, object = rooms.lm)
plot(rooms.boot, index = 2)

# Bootstrap standard errors next to the usual asymptotic (OLS) ones.
SE <- apply(rooms.boot$t, 2, sd)  # St. errors compared to 0.096 and 0.0285
round(cbind(boot = SE, approx = c(coef(summary(rooms.lm))[, 2], 
    err = sigma(rooms.lm))), 3)
##                      boot approx
## (Intercept)         0.039  0.045
## treatmenttournament 0.051  0.055
## treatmentreserve    0.051  0.055
## room_sizeSmall      0.041  0.045
## err                 0.015  0.110

We check after resampling cases

# Case (pairs) bootstrap: resample whole rows of the data and refit.
f <- function(data, i, object) {
    boot.sample <- data[i, ]
    refit <- update(object, data = boot.sample)
    c(coef(refit), sigma(refit))
}
rooms.boot <- boot(f, R = 499, data = rooms, object = rooms.lm)
plot(rooms.boot, index = 2)

# Bootstrap SEs of coefficients and sigma, side by side with asymptotic ones
SE <- apply(rooms.boot$t, 2, sd)  # St. errors compared to 0.096 and 0.0285
round(cbind(boot = SE,
    approx = c(coef(summary(rooms.lm))[, 2], err = sigma(rooms.lm))), 3)
##                      boot approx
## (Intercept)         0.038  0.045
## treatmenttournament 0.056  0.055
## treatmentreserve    0.051  0.055
## room_sizeSmall      0.044  0.045
## err                 0.017  0.110

Another simple model is that the number of entrants \(e_{i}\) for the ith competition style is a binomial random variable with probability \(\pi_{i}\) and size \(n_{i}\). The responses are taken \(y_i = e_i / n_i\) so the mean response is equal to the probability that a competitor enters.

# Binomial GLM (logit link): entrants out of n registrants, per room.
rooms.glm <- glm(cbind(submit, n) ~ treatment + room_size,
    family = binomial(logit), data = rooms)
summary(rooms.glm)
## 
## Call:
## glm(formula = cbind(submit, n) ~ treatment + room_size, family = binomial(logit), 
##     data = rooms)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.07182  -0.39862  -0.01323   0.22723   1.14351  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.33075    0.24103  -5.521 3.37e-08 ***
## treatmenttournament  0.22871    0.29815   0.767    0.443    
## treatmentreserve     0.02759    0.30920   0.089    0.929    
## room_sizeSmall      -0.01605    0.25052  -0.064    0.949    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7.9613  on 23  degrees of freedom
## Residual deviance: 7.2401  on 20  degrees of freedom
## AIC: 83.102
## 
## Number of Fisher Scoring iterations: 4

For an adequate fit, the residual deviance should be approximately distributed as chi-squared with 20 degrees of freedom (the residual degrees of freedom reported above). A chi-squared test gives a p-value of 0.9958201, indicating under-dispersion relative to the model. This can be due to heterogeneity in the probability of entering, which is not well accounted for in the aggregated model.

A quasi-binomial model seems to fit better; confirms the under-dispersion; and now the races vs tournament effect is almost significant (one-sided, 10 percent). Again, however, deviance appears very low relative to the theoretical model.

# Quasi-binomial fit: same mean model, but the dispersion parameter is
# estimated from the data rather than fixed at 1.
rooms.qb <- glm(cbind(submit, n) ~ treatment + room_size,
    family = quasibinomial(logit), data = rooms)
summary(rooms.qb)  # races vs tournament is slightly nonsignificant (one-sided)
## 
## Call:
## glm(formula = cbind(submit, n) ~ treatment + room_size, family = quasibinomial(logit), 
##     data = rooms)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.07182  -0.39862  -0.01323   0.22723   1.14351  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.33075    0.14343  -9.278  1.1e-08 ***
## treatmenttournament  0.22871    0.17742   1.289    0.212    
## treatmentreserve     0.02759    0.18399   0.150    0.882    
## room_sizeSmall      -0.01605    0.14907  -0.108    0.915    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 0.3540818)
## 
##     Null deviance: 7.9613  on 23  degrees of freedom
## Residual deviance: 7.2401  on 20  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

Since the binomial model does not fit the data well, we try modeling entrants in a room as independent Poisson variables with mean \(\mu_{ik}\) for the i-th competition style and k-th room size. That is: \[ \mu_{ik} = \exp(\alpha_i + \beta_k) \] where \(\alpha_i\) is the competition-style effect and \(\beta_k\) the room-size effect. The Poisson model might be better than the binomial because it allows individual probabilities of entry not to be identical. [Need to verify this.]

The poisson model gives very similar results to the binomial model, including under-dispersion.

# Quasi-Poisson with a log(n) exposure offset: models the entry rate
# per registrant in each room.
rooms.poi <- glm(submit ~ offset(log(n)) + treatment + room_size,
    family = quasipoisson, data = rooms)
summary(rooms.poi)
## 
## Call:
## glm(formula = submit ~ offset(log(n)) + treatment + room_size, 
##     family = quasipoisson, data = rooms)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.17879  -0.45198  -0.01118   0.25473   1.31354  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.33240    0.14622  -9.112 1.47e-08 ***
## treatmenttournament  0.22843    0.17896   1.276    0.216    
## treatmentreserve     0.02776    0.18752   0.148    0.884    
## room_sizeSmall      -0.01178    0.15052  -0.078    0.938    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.4657543)
## 
##     Null deviance: 10.1993  on 23  degrees of freedom
## Residual deviance:  9.2688  on 20  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

Another way to model under-dispersed data is with the beta-binomial model (following Davison’s book ch. 10). In this case, the probability of entry \(\pi\) has the beta distribution with parameters \(a\) and \(b\). Hence.

The model is \[ E[y] = m \mu,\quad \operatorname{Var}(y) = m \mu (1-\mu) + m(m-1)\sigma^2 \] where \(\mu\) and \(\sigma^2\) denote the mean and variance of \(\pi\) (under the beta-binomial these are \(a/(a+b)\) and \(ab/\{(a+b)^2(a+b+1)\}\)).

A binomial model with individual data should give very similar results. Let’s check this intuition

# With individual (per-competitor) data: same quasi-binomial mean model
races.glm <- glm(submit ~ treatment + room_size,
    family = quasibinomial(logit), data = races)
summary(races.glm)
## 
## Call:
## glm(formula = submit ~ treatment + room_size, family = quasibinomial(logit), 
##     data = races)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8974  -0.7956  -0.7828   1.4861   1.6398  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.02584    0.25207  -4.070 6.05e-05 ***
## treatmenttournament  0.32428    0.31418   1.032    0.303    
## treatmentreserve     0.03784    0.32294   0.117    0.907    
## room_sizeSmall      -0.01660    0.26354  -0.063    0.950    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.013501)
## 
##     Null deviance: 358.81  on 298  degrees of freedom
## Residual deviance: 357.49  on 295  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

There’s over-dispersion relative to the model: a chi-squared test of the residual deviance, `1 - pchisq(races.glm$deviance, races.glm$df.residual)`, gives a small p-value. So we fit a quasi-binomial model to account for the overdispersion.

# Quasi-binomial refit on individual data to absorb the overdispersion
races.qb <- glm(submit ~ treatment + room_size,
    family = quasibinomial(logit), data = races)
summary(races.qb)

The estimated effect of races vs tournament on the probability of entry is larger but nonsignificant.

We check different link functions, like probit and cloglog, which give very similar results.

# Probit link as a robustness check on the logit results
races.probit <- glm(submit ~ treatment + room_size,
    family = quasibinomial(probit), data = races)
summary(races.probit)
## 
## Call:
## glm(formula = submit ~ treatment + room_size, family = quasibinomial(probit), 
##     data = races)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8979  -0.7960  -0.7832   1.4855   1.6407  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.63062    0.15003  -4.203 3.49e-05 ***
## treatmenttournament  0.19551    0.18904   1.034    0.302    
## treatmentreserve     0.02244    0.19216   0.117    0.907    
## room_sizeSmall      -0.01185    0.15823  -0.075    0.940    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.013503)
## 
##     Null deviance: 358.81  on 298  degrees of freedom
## Residual deviance: 357.49  on 295  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4
# Complementary log-log link, same robustness check
races.cloglog <- glm(submit ~ treatment + room_size,
    family = quasibinomial(cloglog), data = races)
summary(races.cloglog)
## 
## Call:
## glm(formula = submit ~ treatment + room_size, family = quasibinomial(cloglog), 
##     data = races)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8966  -0.7949  -0.7821   1.4870   1.6383  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.18478    0.21663  -5.469 9.66e-08 ***
## treatmenttournament  0.27338    0.26536   1.030    0.304    
## treatmentreserve     0.03261    0.27772   0.117    0.907    
## room_sizeSmall      -0.00957    0.22307  -0.043    0.966    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.013514)
## 
##     Null deviance: 358.81  on 298  degrees of freedom
## Residual deviance: 357.49  on 295  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

It’s clear that the probability of entry is not constant among competitors but depends on ability. So we assume:

\[ \pi = \beta_{0i} + \beta_{1j} + \beta_{2} \text{skill rating}_i \]

Again we use quasibinomial to account for overdispersion.

# Zero-impute missing skill ratings so all rows are usable downstream.
races$rating.imp <- impute(races$rating, "zero")
# NOTE(review): rating.imp is created but never used here -- the model adds
# log(nreg) instead of the rating; confirm which covariate was intended.
races.glm <- glm(submit ~ treatment + room_size + log(nreg), 
    quasibinomial(logit), data = races)
summary(races.glm)
## 
## Call:
## glm(formula = submit ~ treatment + room_size + log(nreg), family = quasibinomial(logit), 
##     data = races)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3336  -0.8449  -0.6520   1.1478   2.1031  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.09547    0.36753  -5.701 2.89e-08 ***
## treatmenttournament  0.32962    0.32620   1.010    0.313    
## treatmentreserve    -0.04902    0.33492  -0.146    0.884    
## room_sizeSmall       0.06018    0.27408   0.220    0.826    
## log(nreg)            0.47244    0.10678   4.424 1.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.012135)
## 
##     Null deviance: 358.81  on 298  degrees of freedom
## Residual deviance: 335.79  on 294  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

Mmmhhh… perhaps I should look at number of submissions… instead of participation.

Submissions per competitor

# Room-level totals of submissions; compare submissions per competitor
# across room sizes and across competition styles.
subm <- aggregate(nsub ~ n + room_id + room_size + treatment,
    data = races, sum)
par(mfrow = c(1, 2))
boxplot(nsub/n ~ room_size, data = subm, horizontal = TRUE)
boxplot(nsub/n ~ treatment, data = subm, horizontal = TRUE)

Testing differences between threshold (race & reserve) and tournament. Bootstrap

# Robust linear regression (rlm, not OLS) of submissions per competitor
subm.lm <- rlm(nsub/n ~ treatment + room_size, data = subm)
summary(subm.lm)
## 
## Call: rlm(formula = nsub/n ~ treatment + room_size, data = subm)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5.01364 -1.59091 -0.07557  2.00992  6.86249 
## 
## Coefficients:
##                     Value   Std. Error t value
## (Intercept)          6.7980  1.2891     5.2736
## treatmenttournament  1.0823  1.5788     0.6855
## treatmentreserve     1.6155  1.5788     1.0233
## room_sizeSmall      -1.6605  1.2891    -1.2881
## 
## Residual standard error: 2.81 on 20 degrees of freedom
# OLS with log response (gaussian GLM of log submissions per competitor)
subm.lm.log <- glm(log(nsub/n) ~ treatment + room_size, data = subm)
summary(subm.lm.log)
## 
## Call:
## glm(formula = log(nsub/n) ~ treatment + room_size, data = subm)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.26396  -0.23919   0.06036   0.27676   0.95858  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.7585     0.2185   8.048 1.06e-07 ***
## treatmenttournament   0.2327     0.2676   0.870    0.395    
## treatmentreserve      0.3517     0.2676   1.314    0.204    
## room_sizeSmall       -0.2322     0.2185  -1.063    0.301    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2864534)
## 
##     Null deviance: 6.5646  on 23  degrees of freedom
## Residual deviance: 5.7291  on 20  degrees of freedom
## AIC: 43.729
## 
## Number of Fisher Scoring iterations: 2
# Poisson with a log(n) exposure offset.
# BUG FIX: the original used log(offset(n)), which estimates a free
# coefficient for log(n) (the printed output below even shows one) instead
# of fixing it at 1 as an offset; offset(log(n)) matches the room-level
# model above. The recorded output should be regenerated.
subm.poi <- glm(nsub ~ offset(log(n)) + treatment + room_size,
    poisson, data = subm)
summary(subm.poi)
## 
## Call:
## glm(formula = nsub ~ log(offset(n)) + treatment + room_size, 
##     family = poisson, data = subm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.7474  -2.3714   0.2507   2.5366   6.2816  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -14.659164   4.866811  -3.012  0.00259 ** 
## log(offset(n))        7.141301   1.799365   3.969 7.22e-05 ***
## treatmenttournament   0.001948   0.055424   0.035  0.97197    
## treatmentreserve      0.156722   0.053508   2.929  0.00340 ** 
## room_sizeSmall        2.369320   0.738567   3.208  0.00134 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 535.99  on 23  degrees of freedom
## Residual deviance: 332.97  on 19  degrees of freedom
## AIC: 491.26
## 
## Number of Fisher Scoring iterations: 4
# Quasi-Poisson version of the same model.
# BUG FIX: offset(log(n)), not log(offset(n)) -- see the Poisson fit above.
subm.qpoi <- glm(nsub ~ offset(log(n)) + treatment + room_size,
    quasipoisson, data = subm)
summary(subm.qpoi)
## 
## Call:
## glm(formula = nsub ~ log(offset(n)) + treatment + room_size, 
##     family = quasipoisson, data = subm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.7474  -2.3714   0.2507   2.5366   6.2816  
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)         -14.659164  19.642887  -0.746    0.465
## log(offset(n))        7.141301   7.262399   0.983    0.338
## treatmenttournament   0.001948   0.223698   0.009    0.993
## treatmentreserve      0.156722   0.215965   0.726    0.477
## room_sizeSmall        2.369320   2.980924   0.795    0.437
## 
## (Dispersion parameter for quasipoisson family taken to be 16.29002)
## 
##     Null deviance: 535.99  on 23  degrees of freedom
## Residual deviance: 332.97  on 19  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

When we turn to the individual propensity to submit, we strongly reject differences in the propensity to submit between large and small rooms. No evidence of competition styles effects.

# Submit and room size: exact test of independence
tab.size <- with(races, table(submit, room_size))
fisher.test(tab.size)  # p = 1 (see output below): no room-size effect
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab.size
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.5689643 1.6912635
## sample estimates:
## odds ratio 
##  0.9846648
# Submit and competition styles
tab <- with(races, table(submit, treatment))
fisher.test(tab)  # p = 0.52
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.5255
## alternative hypothesis: two.sided
# Competition styles with target
tab <- with(races, table(submit, treatment == "tournament"))
fisher.test(tab, alternative = "greater")  # p=0.15
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.1558
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.8435475       Inf
## sample estimates:
## odds ratio 
##   1.355339

If we assume uniform priors, we can easily compute the posteriors in each treatment. (Not sure what we can do with this result.)

# Bayesian analysis Compute posteriors Beta(1 + success, 1 +
# n - success)
# Posterior density of the submission probability for one treatment,
# assuming a uniform Beta(1, 1) prior: the posterior is
# Beta(1 + successes, 1 + failures).
plot.posterior <- function(x, ...) {
    counts <- data.frame(with(races, table(submit, treatment)) + 1)
    shape <- counts[counts$treatment == x, ]
    # Inside curve(), 'x' is the plotting grid, not the treatment label.
    curve(dbeta(x, shape$Freq[2], shape$Freq[1]), ...)
}

# PLOTs: overlay the three posterior densities, one line type per treatment
plot.posterior("race", from = 0.1, to = 0.5, xlab = "propensity to submit", 
    ylab = "posterior density")
plot.posterior("tournament", add = TRUE, lty = 2)
plot.posterior("reserve", add = TRUE, lty = 3)
title("Beta(y + alpha, n-y + beta)")
legend("topright", legend = levels(races$treatment), lty = 1:3, 
    bty = "n")

Regression analysis of entry

Impute missing values at random for the survey. Ignore missing ratings (or impute them).

# Imputation of survey covariates: ratings are kept (with a 'rated'
# indicator) or zero-imputed; the others are imputed at random.
dat.imp <- within(dat, {
    set.seed(25978)  # make the random imputations reproducible
    rating.100 <- rating/100
    rated <- !is.na(rating)
    rating.100imp <- impute(rating/100, "zero")
    hours.imp <- impute(hours, "random")
    risk.imp <- impute(risk, "random")
    grad.imp <- impute(grad, "random")
    male.imp <- impute(male, "random")
    below30.imp <- impute(below30, "random")
    # NOTE(review): 'include' relies on partial matching of cut()'s
    # include.lowest argument; spell out include.lowest = TRUE for clarity.
    expert <- cut(submissions, quantile(submissions[!is.na(rating)]), 
        include = TRUE)
})
## Error in within(dat, {: object 'dat' not found

Fitting the model

# NOTE(review): every call in this chunk errors because dat.imp was never
# created (the within() block above failed with "object 'dat' not found");
# fix the data loading first, then re-run.
# First model has no covariates (i.e., $\gamma=0$)
summary(m0 <- glm(submit ~ treatment, binomial(logit), data = dat.imp))
## Error in is.data.frame(data): object 'dat.imp' not found
# The second model adds the main skill rating measure which
# is available for 2/3 of our population. It seemed better to
# rescale rating in 100-point units and center it on the
# median value. Thus, the estimate of the intercept can be
# easily transformed in the probability of participation of
# the median rated individual assigned to a room with a race
# competition style.

summary(m1 <- update(m0, " ~ . + rating.100"))
## Error in update(m0, " ~ . + rating.100"): object 'm0' not found
summary(m1bis <- update(m0, " ~ . + rating.100imp + rated"))
## Error in update(m0, " ~ . + rating.100imp + rated"): object 'm0' not found
# The third column adds time availability (hours)
summary(m2 <- update(m1bis, " ~ . + hours.imp"))
## Error in update(m1bis, " ~ . + hours.imp"): object 'm1bis' not found
# ... add demographics to m2
summary(m3 <- update(m2, " ~ . + timezone.imp + grad.imp + below30.imp + male.imp"))
## Error in update(m2, " ~ . + timezone.imp + grad.imp + below30.imp + male.imp"): object 'm2' not found
# ... add risk aversion
summary(m4 <- update(m2, " ~ . + risk.imp"))
## Error in update(m2, " ~ . + risk.imp"): object 'm2' not found
# ... add everything stepwise regression
summary(m5 <- step(update(m3, " ~ . + risk.imp")))
## Error in update(m3, " ~ . + risk.imp"): object 'm3' not found
# Compare models
models <- list(m0, m1, m1bis, m2, m3, m4, m5)
## Error in eval(expr, envir, enclos): object 'm0' not found
stargazer(models, type = "text", digits = 2)
## Error in .summary.object$r.squared: $ operator is invalid for atomic vectors
## Error in .summary.object$r.squared: $ operator is invalid for atomic vectors

Explore accuracy of predictions of the stepwise model

# Compare models in terms of prediction accuracy
# Classification summary for a binary-response model: confusion table at a
# configurable threshold (default 0.5, as before), plus precision, recall
# and F-measure.
# BUG FIX: the original indexed tab[2, 2] on a raw table(), which drops a
# row when one class is never predicted and then errors; fixed factor
# levels keep the table 2x2 in all cases.
accuracy <- function(fit, threshold = 0.5) {
    yhat <- predict(fit, type = "response")
    pred <- factor(ifelse(yhat > threshold, 1, 0), levels = c(0, 1))
    actual <- factor(fit$y, levels = c(0, 1))
    tab <- table(predicted = pred, actual = actual)
    tp <- tab[2, 2]
    fp <- tab[2, 1]
    fn <- tab[1, 2]
    precision <- tp/(tp + fp)
    recall <- tp/(tp + fn)
    fmeasure <- 2 * precision * recall/(precision + recall)
    list(conf.table = tab, precision = precision, recall = recall, 
        f.measure = fmeasure)
}
accuracy(m5)
## Error in predict(fit, type = "response"): object 'm5' not found
# Plot jittered outcomes against the linear predictor, with the inverse
# logit curve overlaid. NOTE(review): ilogit is not defined in this file;
# presumably it comes from an attached package -- confirm.
summarize.fit <- function(x) {
    yhat <- predict(x)
    with(x, plot(jitter(y) ~ yhat, col = ifelse(yhat > 0, "brown", 
        "blue"), pch = 16, xlab = "ability ~ Logistic"))
    curve(ilogit, add = T)
}
par(mfrow = c(3, 3))
sapply(models, summarize.fit)
## Error in UseMethod("predict"): no applicable method for 'predict' applied to an object of class "NULL"

Study interaction effects, first with models fitted on each treatment subset and then using interaction terms in the regression.

# NOTE(review): this whole chunk errors because m4/m5 were never created
# upstream (dat.imp is missing); re-run after fixing the data loading.
# Using Akaike criterion to select the best model, we now
# compute the model for each treatment alone
summary(race <- update(m5, "~ . -treatment", subset = treatment == 
    "race"))
## Error in update(m5, "~ . -treatment", subset = treatment == "race"): object 'm5' not found
summary(tour <- update(m5, "~ . -treatment", subset = treatment == 
    "tournament"))
## Error in update(m5, "~ . -treatment", subset = treatment == "tournament"): object 'm5' not found
summary(rese <- update(m5, "~ . -treatment", subset = treatment == 
    "reserve"))
## Error in update(m5, "~ . -treatment", subset = treatment == "reserve"): object 'm5' not found
# Interact on stepwise model
summary(interact <- update(m5, "~ (.)*treatment"))
## Error in update(m5, "~ (.)*treatment"): object 'm5' not found
summary(stepwise <- step(interact))
## Error in terms(object): object 'interact' not found
# Interaction on full model
summary(interact.full <- update(m4, "~ (.)*treatment"))
## Error in update(m4, "~ (.)*treatment"): object 'm4' not found
summary(stepwise.full <- step(interact.full))
## Error in terms(object): object 'interact.full' not found
models <- list(m5, race, tour, rese, interact, stepwise, interact.full, 
    stepwise.full)
## Error in eval(expr, envir, enclos): object 'm5' not found
stargazer(models, type = "text", digits = 3)
## Error in .summary.object$r.squared: $ operator is invalid for atomic vectors
par(mfrow = c(3, 3))
sapply(models, summarize.fit)
## Error in UseMethod("predict"): no applicable method for 'predict' applied to an object of class "NULL"
# This identification result holds under the assumption that
# the skill distribution is logistic. Next, we relax this
# distributional assumption. First, we check different
# distributions [e.g., probit, cauchit, cloglog]. We find
# Probit model seems slightly better in terms of deviance
probit <- update(m5, family = binomial(probit))
## Error in update(m5, family = binomial(probit)): object 'm5' not found
cauchit <- update(m5, family = binomial(cauchit))
## Error in update(m5, family = binomial(cauchit)): object 'm5' not found
cloglog <- update(m5, family = binomial(cloglog))
## Error in update(m5, family = binomial(cloglog)): object 'm5' not found
fit <- list(logit = m5, probit = probit, cauchit = cauchit, cloglog = cloglog)
## Error in eval(expr, envir, enclos): object 'm5' not found
cbind(deviance = sapply(fit, deviance), n = sapply(fit, function(x) length(x$y)))
## Error in lapply(X = X, FUN = FUN, ...): object 'fit' not found
# ... but not necessarily in terms of prediction accuracy
accuracy(probit)
## Error in predict(fit, type = "response"): object 'probit' not found
accuracy(m5)
## Error in predict(fit, type = "response"): object 'm5' not found

We try generalized additive models

# GAMs Generalized additive models
# Smooth (spline) effects of hours and rating on submission probability.
# NOTE(review): fails because dat.imp is missing; see upstream.
m.gam <- mgcv::gam(submit ~ treatment + s(hours.imp) + s(rating.100imp), 
    data = dat.imp, family = binomial)
## Error in is.data.frame(data): object 'dat.imp' not found
summary(m.gam)
## Error in summary(m.gam): object 'm.gam' not found
plot(m.gam)
## Error in plot(m.gam): object 'm.gam' not found

And non-parametric models like KS and Ichimura.

# NOTE(review): prefer library(np) -- require() returns FALSE instead of
# erroring when the package is missing.
require(np)

# Ichimura single-index model of the submission decision
m.np <- npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp + 
    treatment, data = dat.imp)
## Error in npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp + treatment, : object 'dat.imp' not found
summary(m.np)
## Error in summary(m.np): object 'm.np' not found
plot(m.np)
## Error in plot(m.np): object 'm.np' not found
# Klein-spady ... Not sure about this implementation!
kleinspad <- npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp + 
    treatment, data = dat.imp, method = "kleinspady")
## Error in npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp + treatment, : object 'dat.imp' not found
summary(kleinspad)
## Error in summary(kleinspad): object 'kleinspad' not found
plot(kleinspad)
## Error in plot(kleinspad): object 'kleinspad' not found
# Bayesian models model <- 'submit ~ treatment + tothours.imp
# + mm_rating.100 + educ+gender+timezone' bayes <-
# arm::bayesglm(model, family=binomial) summary(bayes)

Scores

The distribution of final scores shows a few values much smaller than the rest of data points. These outliers come from submissions that either failed to compile while testing, thus returning a null value, or returned extremely small values for some other reason. As submissions could not be much smaller than the baseline…

# Plot each coder's final score (from their last submission) against the
# time of that submission, with baseline and target reference lines.
plot.scores.time <- function(...) {
    by.subm <- scores[order(scores$submission), ]               # order by submission number
    last.idx <- tapply(seq_len(nrow(by.subm)), by.subm$coder_id, tail, 1)
    last <- by.subm[last.idx, ]                                 # one row per coder
    last <- last[order(last$timestamp), ]                       # chronological order
    last$final[is.na(last$final)] <- 0.792867                   # Impute 2 missing scores
    plot(final/1e+06 ~ timestamp, data = last, type = "l",
        ...)
    abline(h = 0.792867, lty = 3)  # Baseline
    abline(h = 0.817866, lty = 2, col = 2)  # Target
}
plot.scores.time(col = "lightgray", xlab = "Time of submission (days)",
    ylab = "Final score")
Scores over time

Scores over time

We drop these outliers in regression, which is somewhat equivalent to using one-way winsorized group means in regression.

# https://en.wikipedia.org/wiki/Winsorized_mean cap.scores <-
# function(x, threshold) ifelse(x<threshold, threshold, x)
# cap.01 <- quantile(scores$final, na.rm=TRUE, p=0.1)
# scores$final.cap <- cap.scores(scores$final,
# threshold=cap.01) scores$final.cap2 <-
# cap.scores(scores$final, threshold=792867) # Baseline

Montecarlo simulations

# Load the contest simulator. FIX: library() instead of require() -- for a
# hard dependency we want an immediate error, not a FALSE return value.
library(contest)

# Elasticity parameters and number of competitors for the baseline scenario
elastic <- c(-1, 2, -2)
n <- 10

# Study tournament: simulate over a grid of ability levels
x.100 <- seq(0.01, 0.99, length = 100)
t4 <- contest(x = x.100, n = n, type = "tournament", deadline = 1,
    target = 0.2, elasticity = elastic)

par(mfrow = c(3, 3))
plot(t4, pch = 16, cex = 0.5)

# Create a large matrix of abilities: 100 simulated tournaments, each with
# n competitors drawn uniformly at random
n <- 50
a <- matrix(runif(100 * n), 100, n)

# Best final score and latest submission time in each simulated tournament
ystar <- apply(a, 1, function(row)
    max(contest(row, n = 10, type = "tournament", deadline = 1, target = 0.5)$score))
tstar <- apply(a, 1, function(row)
    max(contest(row, n = 10, type = "tournament", deadline = 1, target = 0.5)$timing))
mean(ystar) - 0.5 * mean(tstar)

# Optimize target in a tournament
tournament <- function(x, y) contest(x, n = length(x), target = y, type = "tournament")$score
h <- apply(a, 1, function(x) max(tournament(x, 0.1)))

# Expected score-minus-timing loss as a function of the target level.
# BUG FIX: the original body had unbalanced parentheses (an empty max())
# and reused 'x' for both the target and the ability row, so the target
# was never actually varied; the optimize() call also passed n=10, which
# matched no argument of f. The reconstruction mirrors the simulation
# objective above -- confirm against the intended specification.
f <- function(x, data) {
    target <- x
    ystar <- apply(data, 1, function(row)
        max(contest(row, n = ncol(data), type = "tournament", target = target)$score))
    tstar <- apply(data, 1, function(row)
        max(contest(row, n = ncol(data), type = "tournament", target = target)$timing))
    -mean(ystar) + mean(tstar)
}

optimize(f, interval = c(0, 1), data = a)  # Optimal target 0.5716364

# Optimize number of competitors in a tournament: average best score over
# R simulated tournaments with n = floor(x) competitors, negated so that
# optimize() (a minimizer) finds the best n.
f <- function(x, R = 99, elastic = c(-1, 2, -2)) {
    n <- as.integer(x)
    best <- replicate(R,
        max(contest(x = runif(n), n = n, type = "tournament", deadline = 1, target = 0.5)$score))
    -mean(best)
}
optimize(f, interval = c(2, 50))  # Optimal number of competitors

# Optimize with concerns for timing
# BUG FIX: the original objective ended with 'mean()' (no argument), which
# errors at run time. The timing penalty is reconstructed in parallel with
# the score term, mirroring the score-minus-timing objective used earlier
# -- confirm against the intended specification.
f <- function(x, R = 99, elastic = c(-1, 2, -2)) {
    n <- as.integer(x)
    best <- replicate(R,
        max(contest(x = runif(n), n = n, type = "tournament", deadline = 1, target = 0.5)$score))
    timing <- replicate(R,
        max(contest(x = runif(n), n = n, type = "tournament", deadline = 1, target = 0.5)$timing))
    -mean(best) + mean(timing)
}
optimize(f, interval = c(2, 50))  # Optimal number of competitors

Panel data

data(scores)

# Build a coder-by-period panel: one row per coder and date bin, with y
# counting that coder's submissions in the bin (0 when there are none).
create.panel <- function(z, id) {
    z$date <- as.numeric(as.Date(z$timestamp))
    z$date <- z$date - min(z$date)  # days since the first submission
    z$y <- 1
    # Zero rows for every coder/date combination, then the observed ones
    zeros <- expand.grid(date = seq(min(z$date), max(z$date)), coder_id = id,
        y = 0)
    stacked <- rbind(zeros, z[, c("date", "coder_id", "y")])
    # Collapse days into four bins: [0,3), [3,5), [5,7), [7,...)
    stacked$date <- ifelse(stacked$date < 3, 1, ifelse(stacked$date < 5, 2,
        ifelse(stacked$date < 7, 3, 4)))
    aggregate(y ~ date + coder_id, data = stacked, FUN = sum)
}

scores.panel0 <- create.panel(scores, races$coder_id)
scores.panel <- merge(scores.panel0, races, by = "coder_id")


# Unconditional semi-parametric [Odds(first) = exp(alpha +
# beta * t )]
# Indicator: any submission by the coder in the period
scores.panel$submit <- scores.panel$y > 0
# BUG FIX: the stratum variable is 'date' -- there is no 'date2' column,
# which is what produced the error recorded below.
clog <- clogit(submit ~ treatment + strata(date), data = scores.panel)
## Error in strata(date2): object 'date2' not found
summary(clog)
## Error in summary(clog): object 'clog' not found
# Logistic
m.log <- glm(submit ~ treatment + factor(date), binomial(logit), 
    data = scores.panel)
summary(m.log)
## 
## Call:
## glm(formula = submit ~ treatment + factor(date), family = binomial(logit), 
##     data = scores.panel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6938  -0.6138  -0.5635  -0.5287   2.0297  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.78616    0.19861  -8.994   <2e-16 ***
## treatmenttournament  0.13586    0.19251   0.706    0.480    
## treatmentreserve     0.02642    0.19592   0.135    0.893    
## factor(date)2       -0.13734    0.23461  -0.585    0.558    
## factor(date)3        0.07646    0.22583   0.339    0.735    
## factor(date)4        0.34873    0.21687   1.608    0.108    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1057.1  on 1195  degrees of freedom
## Residual deviance: 1051.3  on 1190  degrees of freedom
## AIC: 1063.3
## 
## Number of Fisher Scoring iterations: 4
## Add rating
clog2 <- clogit(submit ~ treatment + I(mm_rating/100) + strata(date), 
    data = scores.panel)
## Error in unique(c("AsIs", oldClass(x))): object 'mm_rating' not found
summary(clog2)
## Error in summary(clog2): object 'clog2' not found
# Add week hours
# For each panel period (date 1-4), pick the matching weekly hours column.
# NOTE(review): 'hours' starts as a scalar and is grown by logical
# indexing on the first pass; preallocating to nrow(scores.panel) would be
# clearer -- left as-is to preserve behavior exactly.
hours <- 0
for (i in 1:4) {
    w <- with(scores.panel, switch(i, `1` = week1, `2` = week2, 
        `3` = week3, `4` = week4))
    index <- scores.panel$date == i
    hours[index] <- w[index]
}
# Conditional logit with weekly hours as a time-varying covariate
clog3 <- clogit(submit ~ treatment + hours + strata(date), data = scores.panel)
summary(clog3)
## Call:
## coxph(formula = Surv(rep(1, 1196L), submit) ~ treatment + hours + 
##     strata(date), data = scores.panel, method = "exact")
## 
##   n= 1108, number of events= 179 
##    (88 observations deleted due to missingness)
## 
##                         coef exp(coef) se(coef)     z Pr(>|z|)   
## treatmenttournament 0.062484  1.064477 0.202002 0.309  0.75708   
## treatmentreserve    0.106070  1.111899 0.201006 0.528  0.59771   
## hours               0.008063  1.008096 0.002791 2.889  0.00386 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## treatmenttournament     1.064     0.9394    0.7165     1.582
## treatmentreserve        1.112     0.8994    0.7498     1.649
## hours                   1.008     0.9920    1.0026     1.014
## 
## Rsquare= 0.007   (max possible= 0.577 )
## Likelihood ratio test= 7.97  on 3 df,   p=0.04671
## Wald test            = 8.56  on 3 df,   p=0.03582
## Score (logrank) test = 8.94  on 3 df,   p=0.03012
# Conditional logit with rating and hours together.
# NOTE(review): 'mm_rating' is not a column of scores.panel (see the error
# below); presumably the intended variable is 'rating' -- confirm.
clog4 <- clogit(submit ~ treatment + I(mm_rating/100) + hours + 
    strata(date), data = scores.panel)
## Error in unique(c("AsIs", oldClass(x))): object 'mm_rating' not found
summary(clog4)
## Error in summary(clog4): object 'clog4' not found
# Compare models
models <- list(clog, clog2, clog3, clog4, m.log)
## Error in eval(expr, envir, enclos): object 'clog' not found
stargazer(models, type = "text")
## Error in .summary.object$r.squared: $ operator is invalid for atomic vectors

We transform the data into a panel indexed by the date of the first submission. The probability of entry on a given date is modeled with a conditional logit. We find that the probability of entry on a given date decreases over time (if it were random, there would be a 1/8 chance of a submission on any given date). The estimated drop is about 13 percent from one date to the next (a large cumulative reduction from the first to the last date). Using interactions with treatment dummies, we find that the decrease in the probability of entry is negative in all treatments, but it is about 25 percent in the race treatment (p-value < 0.05), while it is only about 8 and 6 percent for tournament and reserve, respectively (not significant).

# Create variable
# Shift a numeric vector so that its smallest value becomes zero.
dmin <- function(x) {
    x - min(x)
}
# Days elapsed since the earliest submission date (0 = first date).
scores$date <- dmin(as.numeric(as.Date(scores$timestamp)))

# Build a balanced coder-by-date panel of first-submission indicators.
# Rows of `data` are observed first submissions; the returned panel has
# one row per coder-date cell with submit_first = 1 where a submission
# occurred and 0 otherwise.
create.panel <- function(data) {
    # Observed events carry an indicator of 1.
    data$submit_first <- 1
    # Full coder-by-date grid with a zero indicator, so that dates
    # without a submission remain in the panel.
    grid0 <- with(data, expand.grid(date = seq(min(date), max(date)), 
        coder_id = unique(coder_id), submit_first = 0))
    stacked <- rbind(grid0, data[, c("date", "coder_id", "submit_first")])
    # Summing collapses the stacked rows to one row per cell.
    aggregate(submit_first ~ date + coder_id, stacked, FUN = sum)
}

# Panel of coder-by-date first-submission indicators, built from the
# rows flagged as submissions.
scores.panel <- create.panel(subset(scores, submission == 1))
# Attach coder-level covariates (treatment, ratings, ...) from `races`.
z <- merge(scores.panel, races, by = "coder_id")


# Unconditional semi-parametric [Odds(first) = exp(alpha +
# beta * t )]
# Container for the fitted models. `list()` is the idiomatic empty
# container; the original `rep()` only worked by accident (assigning
# `$clog` coerced the result to a list).
fit <- list()
# Conditional logit of the first-submission indicator on date.
fit$clog <- clogit(submit_first ~ date, data = z)
summary(fit$clog)
## Call:
## coxph(formula = Surv(rep(1, 774L), submit_first) ~ date, data = z, 
##     method = "exact")
## 
##   n= 774, number of events= 86 
## 
##         coef exp(coef) se(coef)      z Pr(>|z|)   
## date -0.1510    0.8598   0.0461 -3.275  0.00106 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## date    0.8598      1.163    0.7855    0.9412
## 
## Rsquare= 0.014   (max possible= 0.498 )
## Likelihood ratio test= 11.17  on 1 df,   p=0.0008303
## Wald test            = 10.73  on 1 df,   p=0.001055
## Score (logrank) test = 11.02  on 1 df,   p=0.0008998
# Unconditional parametric [Odds(first) = exp(alpha + beta *
# t )]
# Plain logistic regression counterpart of the clogit above.
fit$logi <- glm(submit_first ~ date, data = z, binomial(logit))
summary(fit$logi)
## 
## Call:
## glm(formula = submit_first ~ date, family = binomial(logit), 
##     data = z)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6252  -0.5441  -0.4394  -0.3801   2.3687  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.53318    0.18853  -8.132 4.21e-16 ***
## date        -0.15121    0.04614  -3.277  0.00105 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 539.99  on 773  degrees of freedom
## Residual deviance: 528.81  on 772  degrees of freedom
## AIC: 532.81
## 
## Number of Fisher Scoring iterations: 5
# Unconditional with interaction
# Date slope allowed to differ by treatment; per the output below the
# baseline `date` coefficient is the race slope, the interaction terms
# are deviations from it.
fit$logi2 <- glm(submit_first ~ date + date:treatment, data = z, 
    binomial(logit))
summary(fit$logi2)
## 
## Call:
## glm(formula = submit_first ~ date + date:treatment, family = binomial(logit), 
##     data = z)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6267  -0.5273  -0.4601  -0.3801   2.5737  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.52808    0.18858  -8.103 5.36e-16 ***
## date                     -0.21835    0.07201  -3.032  0.00243 ** 
## date:treatmenttournament  0.08546    0.07472   1.144  0.25273    
## date:treatmentreserve     0.09344    0.07689   1.215  0.22423    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 539.99  on 773  degrees of freedom
## Residual deviance: 526.96  on 770  degrees of freedom
## AIC: 534.96
## 
## Number of Fisher Scoring iterations: 5
# Note the difference when the intercept is not there

# Conditional [Odds(first) = exp(alpha_i + beta * t )]
# Coder fixed effects via strata(): each coder is its own stratum.
fit$coclog <- clogit(submit_first ~ date + strata(coder_id), 
    data = z)
summary(fit$coclog)
## Call:
## coxph(formula = Surv(rep(1, 774L), submit_first) ~ date + strata(coder_id), 
##     data = z, method = "exact")
## 
##   n= 774, number of events= 86 
## 
##         coef exp(coef) se(coef)      z Pr(>|z|)   
## date -0.1340    0.8746   0.0433 -3.095  0.00197 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## date    0.8746      1.143    0.8034    0.9521
## 
## Rsquare= 0.013   (max possible= 0.386 )
## Likelihood ratio test= 9.93  on 1 df,   p=0.001627
## Wald test            = 9.58  on 1 df,   p=0.00197
## Score (logrank) test = 9.81  on 1 df,   p=0.001735
# Interactions with treatment dummies [Odds(first) =
# exp(alpha_i + beta_i * t )]
# One date slope per treatment (no common date term), plus coder strata.
fit$coclog2 <- clogit(submit_first ~ date:treatment + strata(coder_id), 
    data = z)
summary(fit$coclog2)
## Call:
## coxph(formula = Surv(rep(1, 774L), submit_first) ~ date:treatment + 
##     strata(coder_id), data = z, method = "exact")
## 
##   n= 774, number of events= 86 
## 
##                              coef exp(coef) se(coef)      z Pr(>|z|)   
## date:treatmentrace       -0.27991   0.75585  0.08801 -3.180  0.00147 **
## date:treatmenttournament -0.08726   0.91644  0.06847 -1.274  0.20253   
## date:treatmentreserve    -0.06708   0.93512  0.07522 -0.892  0.37256   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                          exp(coef) exp(-coef) lower .95 upper .95
## date:treatmentrace          0.7559      1.323    0.6361    0.8982
## date:treatmenttournament    0.9164      1.091    0.8013    1.0481
## date:treatmentreserve       0.9351      1.069    0.8069    1.0837
## 
## Rsquare= 0.018   (max possible= 0.386 )
## Likelihood ratio test= 14.16  on 3 df,   p=0.002689
## Wald test            = 12.53  on 3 df,   p=0.005761
## Score (logrank) test = 13.61  on 3 df,   p=0.003487
# Note: the above model is the same as clogit(submit_first ~
# date + date:treatment + strata(coder_id), data=z)

# Coefficients
# Returns a list here (coefficient vectors differ in length across models).
sapply(fit, coef)
## $clog
##       date 
## -0.1510078 
## 
## $logi
## (Intercept)        date 
##  -1.5331826  -0.1512076 
## 
## $logi2
##              (Intercept)                     date date:treatmenttournament 
##              -1.52808481              -0.21835013               0.08545552 
##    date:treatmentreserve 
##               0.09344282 
## 
## $coclog
##       date 
## -0.1339918 
## 
## $coclog2
##       date:treatmentrace date:treatmenttournament    date:treatmentreserve 
##              -0.27990753              -0.08725850              -0.06707555
# Days as factors


# # Compute first submission time time.max <- '2015-03-16
# 11:59:59 EDT' time.min <- '2015-03-08 12:00:00 EDT'
# time.censored <- 192 fsutime <-
# as.numeric(difftime(races.sub$timestamp, time.min ,
# unit='hours')) dat <- with(races.sub, aggregate(fsutime ~
# handle, FUN=min)) dat$fsu <- 1 dat2 <- merge(dat, races,
# by='handle', all=TRUE) dat2$fsu[is.na(dat2$fsu)] <- 0
# dat2$fsutime[is.na(dat2$fsutime)] <- time.censored surv <-
# rep() surv$m1 <- survreg(Surv(fsutime, fsu) ~ 1, dat2,
# dist='weibull', scale=1) surv$m2 <- survreg(Surv(fsutime,
# fsu) ~ treatment + mmevents, dat2 , dist='weibull',
# scale=1) surv$m3 <- survreg(Surv(fsutime, fsu) ~ treatment
# + mmevents, dat2, dist='exponential') surv$tobin <-
# survreg(Surv(fsutime, fsu, type='left') ~ treatment +
# mmevents, data=dat2, dist='gaussian') stargazer(surv,
# type='text') # A model with different baseline survival
# shapes for two groups, i.e., # two different scale
# parameters survreg(Surv(time, status) ~ ph.ecog + age +
# strata(sex), lung) # There are multiple ways to
# parameterize a Weibull distribution. The survreg # function
# embeds it in a general location-scale family, which is a #
# different parameterization than the rweibull function, and
# often leads # to confusion.  # survreg's scale =
# 1/(rweibull shape) # survreg's intercept = log(rweibull
# scale) # For the log-likelihood all parameterizations lead
# to the same value.  y <- rweibull(1000, shape=2, scale=5)
# survreg(Surv(y)~1, dist='weibull') # Economists fit a model
# called `tobit regression', which is a standard # linear
# regression with Gaussian errors, and left censored data.
# tobinfit <- survreg(Surv(durable, durable>0, type='left') ~
# age + quant, data=tobin, dist='gaussian')

Model estimation

  • We observe \(Y_1, ..., Y_L\) participants for \((l=1, ..., L)\) rooms
  • Each room has \(n_l\) registered competitors and room characteristics \(X_l\) (deadline, target, etc.).
  • We assume that \(Y_l\) is distributed as a binomial variable with parameters \(n_l\) and \(p(x_l, \theta)\) where \(p(x, \theta)\) describes the probability for a competitor to participate in a room described by the vector \(x\):
\[\begin{equation} \label{model} Y_{l} \sim \text{Binomial}(N_l, p(x_l, \theta)) \quad \text{for each } l=1,...,L. \end{equation}\]
  • In binomial linear models, the probability function \(p(x, \theta)\) is related to a linear prediction through a one-to-one transformation called the link function (see XXXX). [Let see if our model is linear]

  • Competitors have a latent ability variable \(a^*\) drawn from a common distribution \(F_l(\cdot)\) (distributions may differ across rooms).

  • At equilibrium, \(Y_{l} = 1 \iff a^* \geq a_{0l}\) where \(a_{0l}\) is the marginal ability type in room \(l\).

  • Zero profit condition (revenues = costs) is

\[ R(a; F_l) = C(a, y_{0,l}, t_{0,l}) \]

  • Given \(F_l(\cdot)\) the distribution of individual abilities in each room, the probability of entry is:
\[\begin{equation} p(x_l, \theta) = 1 - F_l(a_{0l}) \end{equation}\]

First, we rewrite the probability \(p(x, \theta)\) in the following way:

\[ p(x_l, \theta) = 1 - \theta_1 x_l^{\theta_2} \]

where we denote the (observed) target by \(x_l\) and the vector of parameters by \(\theta=(\theta_1, \theta_2)\) where \(\theta_1 \equiv m_l^{\alpha/(n_l-1)}\) and \(\theta_2 \equiv \beta/(n_l-1)\). Assume that \(n_l=n\) for each \(l=1,...,L\).

Let \(Y_1, Y_2, ..., Y_L\) be the count of participants generated by our binomial-distribution model. The likelihood is:

\[ \mathcal{L} = \prod_{l=1}^L \binom{n_l}{y_l} p(x_l, \theta)^{y_l} [1-p(x_l, \theta)]^{n_l - y_l} \]

The log-likelihood is:

\[ ll = \sum_{l=1}^L y_l\log(p(x_l, \theta)) + (n_l - y_l) \log(1-p(x_l, \theta)) \]

where we omit the binomial coefficient as it is a constant that does not affect estimation.

Substituting our model into the above equation:

\[ ll = \sum_{l=1}^L y_l\log(1 - \theta_1 x_l^{\theta_2}) + (n - y_l) [\log(\theta_1) + \theta_2\log(x_l)] \]

Let the summation be denoted by \(\sum y_l = \bar y\). We can rewrite the ll expression:

\[ ll = (L n - \bar y)\log(\theta_1) + \sum_{l=1}^L y_l\log(1 - \theta_1 x_l^{\theta_2}) + (n - y_l) \theta_2\log(x_l) \]

First order conditions are:

\[ \frac{\partial ll}{\partial \theta_1} = \frac{(L n - \bar y)}{\theta_1} - \sum_{l=1}^L y_l \frac{x_l^{\theta_2}}{1 - \theta_1 x_l^{\theta_2}} = 0 \]

\[ \frac{\partial ll}{\partial \theta_2} = \sum_{l=1}^L \left[- y_l \frac{x_l^{\theta_2}\theta_1 \log(x_l)}{1 - \theta_1 x_l^{\theta_2}} + (n - y_l) \log(x_l) \right]= 0. \]

Try with simulations

# Simulation design: n trials (competitors) per room, nsize rooms.
n <- 10
nsize <- 199
# True structural parameters.
theta1 <- 0.75
theta2 <- 0.05
params <- c(theta1, theta2)
# Room-level covariate (e.g. the target), drawn uniformly on (0, 1).
x <- runif(nsize)
# Entry probability implied by the model: p = 1 - theta1 * x^theta2.
p <- 1 - theta1 * x^theta2
# hist(prob, 100)

# Negative binomial log-likelihood of the structural entry model.
#
# Args:
#   theta: length-2 vector (theta1, theta2).
#   data: data frame (or list) with components `y` (participant counts),
#     `x` (room covariate) and `n` (room size).
#
# Returns the negative log-likelihood, suitable for minimization
# with optim().
loglik <- function(theta, data) {
    theta1 <- theta[1]
    theta2 <- theta[2]
    # Model-implied entry probability for each room.
    p <- 1 - theta1 * data$x^theta2
    # log = TRUE (not the reassignable shorthand T) for numerical stability.
    -sum(dbinom(data$y, data$n, p, log = TRUE))
}

# Simulate data
# Draw participation counts from the binomial model at the true p.
y <- rbinom(nsize, size = n, prob = p)

# Example: loglik(c(0.5, 0.9), data.frame(y, x, n))

# MLE estimation
dat <- data.frame(y, x, n)
# Random starting values for (theta1, theta2); optim() minimizes the
# negative log-likelihood (default Nelder-Mead).
theta.start <- runif(2)
mle.est <- optim(theta.start, loglik, data = dat)
thetahat <- mle.est$par

# Replications
# Monte Carlo: re-simulate the outcomes and re-estimate 1000 times to
# inspect the sampling distribution of the estimator.
out <- replicate(1000, {
    y <- rbinom(nsize, size = n, prob = p)
    dat <- data.frame(y, x, n)
    mle.est <- optim(theta.start, loglik, data = dat)$par
})
# Estimates across replications against the true values (red lines).
boxplot(t(out))
abline(h = params, col = 2)

# True entry-probability curve at the data-generating parameters.
truth <- function(x) 1 - params[1] * x^params[2]

# Now I can 'predictions' on what would happen with different
# 'x' distribution
# Fitted entry-probability curve at the MLE.
prediction <- function(x) 1 - thetahat[1] * x^thetahat[2]

# Compare with logistic regression
N <- rep(n, nsize)
# For a binomial glm the two-column response must be
# cbind(successes, failures). The original passed the number of
# trials (N) as the second column, which inflates the implied
# number of trials to y + N and biases the fit.
logistic <- glm(cbind(y, N - y) ~ x, family = binomial(logit))
# Predicted entry probability from the logistic benchmark.
prediction.logistic <- function(x) {
    # Inverse-logit of the linear predictor.
    yhat <- coef(logistic)[1] + coef(logistic)[2] * x
    exp(yhat)/(1 + exp(yhat))
}

# FIGURE
# Overlay the true curve, the structural MLE fit, and the logistic
# benchmark over the covariate range.
curve(truth, xlab = "Room covariate", ylim = c(0, 1))
curve(prediction, add = TRUE, lty = 2)
curve(prediction.logistic(x), add = TRUE, lty = 3)
legend("topright", c("Truth", "Estimated", "Logistic"), lty = 1:3)

Identification (old)

Example with the Uniform distribution

Imagine that the distribution \(F_\theta\) is uniform on \((0, \theta)\) and the cost ability \(c_a(x) = 1/x\). Then the zero profit condition becomes

\[\begin{align} \left(\frac{m}{\theta}\right)^{n-1} m = \beta x_l \end{align}\]

Solving for the marginal type gives:

\[\begin{align} m = \left(\beta x_l \theta^{n-1}\right)^{1/n} \end{align}\]

So, the probability of participation becomes

\[\begin{align} p(x_l, \theta) & = 1 - F_\theta(m) \\ & = 1 - \frac{\left(\beta x_l \theta^{n-1}\right)^{1/n} }{\theta} \\ & = 1 - \left(\frac{\beta x_l}{\theta}\right)^{1/n}. \end{align}\]

Because it is non-linear in its parameters, the model is not identifiable [need proof]. However, we can re-parametrize the model with \(\beta/\theta \equiv \eta\) to make the parameter \(\eta\) identifiable.

Simulated distribution of participants with uniform distribution. Parameters are $\eta=0.1$ and $n=15$. Data simulated 500 times. Dashed curve had higher costs ($x_l=1.5$) than the solid curve ($x_l=0.5$).

Simulated distribution of participants with uniform distribution. Parameters are \(\eta=0.1\) and \(n=15\). Data simulated 500 times. Dashed curve had higher costs (\(x_l=1.5\)) than the solid curve (\(x_l=0.5\)).

Example with Beta distribution

In this example, we assume abilities are drawn from a lognormal distribution with parameters \(\mu\) and \(\sigma\).

Estimation

Our aim is to estimate the parameters \(\theta\) and \(\beta\) and to evaluate whether costs are different between the three competition modes under study: race, tournament, tournament + requirement.

It is natural to estimate the parameters \(\theta\) by maximum likelihood because the ability distribution (and therefore the distribution of the outcomes) is known. The estimation criterion used here is the minimization of the deviance, which is equivalent to maximizing the likelihood.

The deviance is: \[\begin{equation} D(\theta) = -2 \sum \left[ Y \log \left(\frac{N p(x, \theta)}{Y}\right) + (N - Y) \log \left( \frac{N - N p(x, \theta)}{N - Y}\right) \right]. \end{equation}\]

Note that the deviance is a function of the likelihood (see xxx pg. xxx).

# Define likelihood
# Fit the structural entry model by minimizing a deviance-style objective
# with optim(). `x` is the design matrix, `y` the response, `wt` the
# observation weights; an intercept column is prepended when `intercept`
# is TRUE. Prints coefficients/deviance and invisibly returns the optim fit.
# NOTE(review): several apparent defects, flagged but left unchanged:
#   - `fmin` reads `n` from the enclosing/global environment instead of
#     taking it as an argument;
#   - the `start` argument is ignored: optim() is called with the scalar
#     0.5 even though `p` parameters are expected (see the commented-out
#     call that does use `start`);
#   - `beta %*% X` looks transposed relative to the usual X %*% beta;
#   - `p` is reused for both the parameter count and, inside `fmin`,
#     the probability — confirm intent before reusing this function.
structreg <- function(x, y, wt = rep(1, length(y)), intercept = TRUE, 
    start = rep(0, p), ...) {
    fmin <- function(beta, X, y, w) {
        p <- 1 - (beta %*% X)^(1/n)  # Function of parameters
        -sum(2 * w * ifelse(y, log(p), log(1 - p)))
    }
    # gmin <- function(beta, X, y, w) { # Gradient here!  }
    # Coerce a bare vector to a one-column matrix.
    if (is.null(dim(x))) 
        dim(x) <- c(length(x), 1)
    dn <- dimnames(x)[[2]]
    if (!length(dn)) 
        dn <- paste("Var", 1:ncol(x), sep = "")
    p <- ncol(x) + intercept
    if (intercept) {
        x <- cbind(1, x)
        dn <- c("(Intercept)", dn)
    }
    # Factor responses: success = any level other than the first.
    if (is.factor(y)) 
        y <- (unclass(y) != 1)
    # fit <- optim(start, fmin, gmin, X = x, y = y, w = wt,
    # method = 'BFGS', ...)
    fit <- optim(0.5, fmin, X = x, y = y, w = wt, method = "BFGS", 
        ...)
    names(fit$par) <- dn
    cat("\nCoefficients:\n")
    print(fit$par)
    cat("\nResidual Deviance:", format(fit$value), "\n")
    cat("\nConvergence message:", fit$convergence, "\n")
    invisible(fit)
}

# Create data
# Inverse logit (logistic) transformation.
ilogit <- function(x) exp(x)/(1 + exp(x))
n <- 50000
competitors <- 5
# Covariates: one draw per observation. The original drew only
# `competitors` (5) values for x1, which cbind() then silently recycled
# across all n rows; draw n values like x2 and x3.
x1 <- rchisq(n, df = 3)
x2 <- runif(n)
x3 <- rnorm(n)
X <- cbind(rep(1, n), x1, x2, x3)
betas <- c(-2.5, 0.1, 0.25, 0.5)
fc <- ilogit(X %*% betas)  ## This the fixed cost
# Entry probability implied by the structural model.
p <- 1 - fc^(1/competitors)
y <- rbinom(n = n, size = competitors, prob = p)

# Objective function
# Deviance-style objective: entry probability implied by the structural
# model, scored against the (treated-as-binary) participation outcome.
fmin <- function(beta, X, y, w, competitors) {
    entry_prob <- 1 - ilogit(X %*% beta)^(1/competitors)
    contrib <- ifelse(y, log(entry_prob), log(1 - entry_prob))
    -sum(2 * w * contrib)
}

# Random starting values, one per coefficient.
b.start <- runif(length(betas))
# Minimize the objective from random starting values (default
# Nelder-Mead); outer parentheses print the estimates.
(out <- optim(b.start, fmin, X = X, y = y, competitors = competitors, 
    w = rep(1, length(y)))$par)
## [1] -12.8788774   0.4197166   0.9890749   2.1442844
# Dividing by the number of competitors brings the estimates close to
# the true betas (see the comparison below).
rbind(out/competitors, betas)
##            [,1]       [,2]     [,3]      [,4]
##       -2.575775 0.08394333 0.197815 0.4288569
## betas -2.500000 0.10000000 0.250000 0.5000000

More examples

Suppose F is lognormal(\(\mu, \sigma\)), then the zero profit condition is:

\[\begin{align} \left[\frac 12 +\frac 12 \operatorname{erf}\left(\frac{\log{m}-\mu}{\sigma\sqrt{2}}\right)\right] = a^{-1/(n-1)} K_l \end{align}\]
# xxx
# Zero-profit locus: x * F(x)^(n-1) with lognormally distributed abilities.
#
# Args:
#   x: candidate marginal type.
#   mu, sigma: lognormal meanlog / sdlog parameters.
#   n: number of competitors.
zeroprofit <- function(x, mu, sigma, n) {
    x * plnorm(x, meanlog = mu, sdlog = sigma)^(n - 1)
}
# Draw the locus for several values of mu. The first iteration must
# create the plot; the original passed add = TRUE for every i, which
# errors because no plot exists yet.
for (i in 3:10) {
    curve(zeroprofit(x, i, 10, 10), add = (i > 3), lty = i)
}
# Clear
# NOTE(review): wiping the workspace and attach()ing a data frame are
# discouraged in reusable code; both are kept to preserve this
# replication script's behavior.
rm(list = ls())

# Libraries
# library() (unlike require()) fails loudly when a package is missing,
# which is what we want for hard dependencies.
library(xtable)
library(moments)
library(races)

# Data
data(races)
attach(races)

# TABLE 1.
# Room size = number of registered coders in each treatment-room cell.
size <- ave(submit, paste(treatment, room), FUN = length)
# Submissions per room, by room size and treatment; outer parentheses
# print the table.
(m <- aggregate(submit ~ room + size + treatment, FUN = sum))
##    room size  treatment submit
## 1     1    9       race      2
## 2     2   10       race      2
## 3     3   10       race      5
## 4     4   10       race      1
## 5     5   15       race      4
## 6     6   15       race      3
## 7     7   15       race      4
## 8     8   15       race      5
## 9     1   10 tournament      3
## 10    2   10 tournament      5
## 11    3   10 tournament      5
## 12    4   10 tournament      2
## 13    5   15 tournament      5
## 14    6   15 tournament      4
## 15    7   15 tournament      4
## 16    8   15 tournament      5
## 17    1   10    reserve      1
## 18    2   10    reserve      3
## 19    3   10    reserve      3
## 20    4   10    reserve      2
## 21    5   15    reserve      4
## 22    6   15    reserve      6
## 23    7   15    reserve      3
## 24    8   15    reserve      5

The model

We now generalize the contest game introduced by @moldovanu2001optimal to a situation where players simultaneously decide \(i)\) the quality and \(ii)\) how fast to produce a given output. Then we explore the problem of revenue maximization faced by a contest designer with preferences for both quality and time.

Basic setup

Equilibrium

In this section, we solve the model for the unique symmetric Bayesian Nash equilibrium of players.

Tournament

At equilibrium, players choose \(\quality\) and \(t_i\) by maximizing given their beliefs about the equilibrium actions of the other players.

The key to characterization of the equilibrium is that \(t_i=\deadline\) is a (weakly) dominant strategy for each player. Indeed, any time strictly below the deadline does not affect the probability of winning but is costly in terms of effort, and any time strictly above the deadline gives a negative payoff.

Since players should avoid playing weakly dominated strategies, we have that \(t_i=\deadline\) for every \(i=1,...,n\). Then it can be shown that, at equilibrium, the optimal quality is a one-to-one transformation of a player’s ability according to some “bidding” function \(b(\cdot)\). Hence, \(Y_i = b(A_i)\). Since the distribution of \(A_i\) is known, one can use a change of variables formula to express the probability of winning in terms of the distribution \(F\). Then, the first order condition of \(\pi\) with respect to quality gives the following differential equation in \(\ability\) (for every \(i=1,...,n\)):

\[ \sum_{k=1}^{q} \hat p^\prime_{k}(\phi(y_i)\mid \tournament) \phi^\prime(y_i) v_k = \beta y_i^{\beta-1} a^\alpha t^\gamma \]

with the boundary condition \(y(\lotype)=0\).

At equilibrium we have \(a=\phi(y)\). Thus, the differential equation is separable. After rearranging, we have:

\[ \sum_{k=1}^{q}\phi(y_i)^{-\alpha}\hat p^\prime_{k}(\phi(y_i)\mid \tournament)\phi^\prime(y_i) v_k = \beta y_i^{\beta-1} t^\gamma. \]

Integration of both sides with respect to \(\phi\) gives:

\[ \int_{0}^{y} \sum_{k=1}^{q} t^{-\gamma} v_k \phi(x)^{-\alpha}\hat p^\prime_{k}(\phi(x)\mid \tournament)\phi^\prime(x) dx = \int_{0}^{y} \beta x^{\beta-1} dx \]

Using the chain rule of derivatives

\[ \int_a^b f(g(x)) g^\prime(x) dx = \int_{g(a)}^{g(b)} f(x) dx \qquad \text{(chain rule)} \]

the solution is

\[\begin{equation} \label{optimal quality tournament} y(a_i) = \left[\deadline^{-\gamma} \sum_{k=1}^q v_k \int x^{-\alpha} \hat p^\prime_k(x)dx \right]^{1/\beta} \end{equation}\]

An important property of the equilibrium strategy is that \(y(a_i)\) has its lower bound at zero and its upper bound at

\[ y(\hitype) = \left[\deadline^{-\gamma} v_1 \left( \frac{\hitype^{-\alpha+1} - \lotype^{-\alpha+1}}{-\alpha + 1} \right)\right]^{1/\beta}. \]

  • Monotonicity of the equilibrium output quality implies that, for every \(i=1, ..., n\), the equilibrium expected payoff from the contest \(\pi_i^*\) depends on the rank of the player’s ability relative to the others.

  • Equilibrium payoffs do not depend on the elasticity \(\beta\) or \(\gamma\), but only on \(\alpha\).
  • When abilities are distributed over the unit interval, the equilibrium payoff for the highest ability player is \(\pi(\hitype=1) = v_1 (-\alpha)/ (1-\alpha)\).

Equilibrium in a race

In a similar way, one can derive the equilibrium strategy in a race. Again the key observation is that any quality below the target gives a zero probability of winning and any quality above the target gives a constant probability of winning. Thus, player \(i\)’s choice of quality is either zero (with \(t_i=\deadline\) by convention) or \(y_i=\target\).

Then, the equilibrium xxx for player \(i\) is \[\begin{equation} \label{tstar} t^*(a_i) = \ctime^{-1} \left[\ctime(\deadline) + \frac{1}{\cscore(\target)} \left(\alpha \int_{a_i}^{\hitype} A^\prime(z) dz + (1-\alpha) \int_{a_i}^{\hitype} B^\prime(z) dz \right) \right] \end{equation}\] where \[\begin{equation} A(x) = \frac{1}{c_{a}(x)} f_{(n-1:n-1)}(x) \end{equation}\] and \[\begin{equation} B(x) = \frac{1}{c_{a}(x)} \left\{ \left[1- F_{(n-1:n-1)}(x)\right]f_{(n-1:n-2)}(x) + f_{(n-1:n-1)}(x) F_{(n-1:n-2)}(x) \right\}. \end{equation}\] An important property of XX is that \(y^*(a_i)\) has its upper bound in XX and lower bound in XX. Again payoffs are xxxx. Hence, by setting to zero and solving for the ability, gives the marginal ability \({\underline a}\) as \[\begin{equation} {\underline a}= h(n, V, F_A, C, d). \end{equation}\]

Tournament vs races

By comparing equilibrium xxx and xxx, we find that the race and the tournament do not (ex-post) dominate one another with respect to output quality. Whereas the race always dominates the tournament with respect to completion time. [This is only when the deadline is the same. Otherwise, there’s always xxxx.] This result is stated below.

Let’s make an example.

# Parametrization of the example.
p <- plnorm   # CDF of individual abilities
r <- rlnorm   # sampler for individual abilities
cy <- function(x) x^2             # cost of output quality (quadratic)
ct <- function(x) 2 * exp(1 - x)  # cost of completion time (decreasing)

FIGURE 1. Equilibrium bids in a race and a tournament.

Implications. The above proposition applies only if the target is higher in a race than in a tournament. But what if the two competitions had the same target ? In that case, tournaments and races have the same marginal type. Therefore, the performance of players in the tournament with reserve are always non-lower than those in the race. This does not imply that it is optimal to set the target. On the contrary, we will show that it is optimal to set an optimal target in a tournament that is below the optimal target in a race. Next section.

The contest designer’s problem

Let us now focus on the contest designer’s problem. Imagine the contest designer can choose the competition format to be either the race or the tournament. Imagine all other aspects of design are given. The prize structure \(\alpha\) has been already chosen. There is a deadline \(\deadline\), which is the same in both competition formats. [The quality requirement \(\target^c\) in the tournament is smaller than that in the race (\(\target^\race > \target^\tournament\))] We will relax this assumption later to consider a more general setting where these variables are also part of the contest designer’s problem.

The contest designer has an objective function that is increasing in the expected quality of the winning solution and decreasing in the corresponding time to completion. Here, to keep the exposition simple, we assume that the contest designer cares about the winning submission only: second-placed efforts are not considered. [If the principal values the diversity of the solutions … but we assume it does not.]

XXX EQUATION XXXX

The optimal choice involves a comparison of the expected profits between the race and the tournament. Given xxxx, we can show that there will be a threshold on the cost of completion time \(\hat\tau\) above which the race is a better choice than the tournament, and vice versa.

Proof. In a tournament, the objective function is \[\begin{align} R_\tournament & = \Pr(t_{(1:n)}\leq \deadline) \left\{\int y^*(x \mid t_{(1:n)}\leq \deadline) dF_{n:n}(x) - \tau \deadline - 1 \right\} \nonumber\\ & = \int_{\mtype}^{\hitype} y^*(x) dF_{n:n}(x) - \tau \deadline - 1. \end{align}\]

That is, the contest designer’s objective function is the sum of the expected output quality for a given deadline, minus the cost \(\tau\) of having the winner working on the task until completion (i.e., until the deadline), and the cost of the prize pool (recall the prize pool is normalized to one).

[Implicitly, you’re assuming that the prize is always large enough to ensure positive effort.] [Second prize too is stochastic!!!!]

In a race, the objective function is \[\begin{align} R_\race & = \Pr(a_{(N)}\geq \mtype) \left\{\target - \alpha - \Pr(a_{(N-1)}\geq \mtype) (1-\alpha) \right\} - \tau \int_{\mtype}^{\infty} t^*(x) dF_{N:N}(x) \nonumber\\ & = [1-F_{N:N}(\mtype)] \left\{\target - \alpha - [1-F_{N-1:N}(\mtype)] (1 - \alpha) \right\} - \tau \int_{\mtype}^{\infty} t^*(x) dF_{N:N}(x). \end{align}\] Note. \(t^*(x) \leq \deadline\) for all \(x\)’s. Thus, a lower bound for the above objective function can be computed: \[\begin{align} \underline {R_\race} & = [1-F_{N:N}(\mtype)] \left\{\target - \alpha - [1-F_{N-1:N}(\mtype)] (1 - \alpha) - \tau \deadline\right\} \end{align}\]

An even simpler lower bound is rewriting the above expression as if \(\alpha=1\) (note if the real alpha was set 1 then also mtype would change and therefore setting alpha hits a lower bound only when mtype does xxxx when alpha is 1).

Note. \(y^*(x)\) is lower than \(\target\) for all \(a < \mtype\). Thus, a lower bound of the tournament’s expression is \[\begin{align} \overline {R_\tournament} & = [1-F_{N:N}(\mtype)] \target + \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) - \tau \deadline - 1. \end{align}\] \[\begin{align} \underline {R_\race} \geq & \overline {R_\tournament} \nonumber\\ [1-F_{N:N}(\mtype)] (\target - 1 - \tau \deadline) \geq & [1-F_{N:N}(\mtype)] \target + \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) - \tau \deadline - 1 \nonumber\\ - [1-F_{N:N}(\mtype)] (\tau\deadline + 1) \geq & \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) - (\tau \deadline + 1) \nonumber\\ F_{N:N}(\mtype) (\tau \deadline + 1) \geq & \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) \nonumber\\ \tau \geq & \left[ \frac{\int_{\mtype}^\infty y^*(x) dF_{N:N}(x)}{F_{N:N}(\mtype)} -1 \right] \frac{1}{\deadline} \end{align}\]

End proof.

When the cost of time \(\tau\) is sufficiently high, the race is preferred. Interestingly, the threshold is a function of the deadline to complete the job, as xxx. It also depends on the shape of xxxx.

Optimal minimum-entry

Now we turn to discuss the contest designer’s choice of an optimal minimum requirement \(\target\). So far, we have assumed that \(\target^\race>\target^\tournament\). Now, we show that the assumption that xxxx is indeed an optimal choice of the contest designer. This is summarized in the next proposition.

To prove that it is indeed the case. We proceed in two steps. First, we assume that the contest designer does not care about minimizing the timing of the innovation by imposing \(\tau = 0\). For simplicity, assume that \(\alpha=1\) (winner-takes-all). In a race, this means that the optimal target will be a value that makes equal the costs in terms of less participation versus the gains in terms of higher values of the winning solutions. Formally, the contest designer’s problem in a race is \[\begin{align} \text{maximize } & R^\race = [1-F_{N:N}(\mtype)] (\target^\race - 1). \end{align}\] Note that \(\mtype\) depends on the target. This is clearly concave in \(\target^\race\). Thus, the first order condition is also sufficient. \[\begin{align}\label{foc race} \text{FOC } & \Rightarrow -F^\prime_{N:N}(\mtype) \mtype^\prime (\target^\race - 1) + [1-F_{N:N}(\mtype)] = 0. \end{align}\] In a tournament, … \[\begin{align} \text{maximize } & R^\race = \int_{\mtype}^\infty y^*(x, \target) d F_{N:N}(x) - [1-F_{N:N}(\mtype)]. \end{align}\]

Concavity of the objective is not guaranteed. If it fails, the optimal target is zero, which is lower than the optimal target in a race.

Instead. If the objective function is (strictly) concave then there’s an internal solution.

\[\begin{align} \label{foc tournament} \text{FOC } \Rightarrow & \frac{d\int_{\mtype}^\infty y^*(x, \target) d F_{N:N}(x)) }{d \target} + F^\prime_{N:N}(\mtype) \mtype^\prime =0 \nonumber\\ & \text{(by using Leibniz rule)}\nonumber\\ \Rightarrow & - y^*(\mtype, \target) \mtype^\prime F^\prime_{N:N}(\mtype) + \int_{\mtype}^\infty \dystar - F^\prime_{N:N}(\mtype) \mtype^\prime = 0\nonumber\\ \Rightarrow & -\target \mtype^\prime F^\prime_{N:N}(\mtype) + \int_{\mtype}^\infty \dystar - F^\prime_{N:N}(\mtype) \mtype^\prime = 0. \end{align}\] Using with , the optimal target is the same in the race and the tournament only if \[\begin{align} \int_{\mtype}^\infty \dystar = [1- F_{N:N}(\mtype)]. \end{align}\]

\[ \frac{\partial y^*(x, \target)}{\partial \target} = \frac{c_y^\prime(\target)}{c_y^\prime(y^*(x, \target))}. \]

Then.

  • If \(c_y(\cdot)\) is linear, we have that the ratio is one for all \(x\).

  • If \(c_y(\cdot)\) is convex, then we have that it is less than one. If

  • If \(c_y(\cdot)\) is concave, then we have that it is higher than one.

As a result, if linear or convex the first order condition is lower than that in the race. Since the obj. function is concave (second order is decreasing), the target should be lower in a tournament than in a race to satisfy the first order condition. (a lower target increases the focs.).

Conjecture. If \(\tau>0\), the \(\target\) in the race is higher.

Structural econometric model

Readings:

General two-step strategy:

  • First step. Identify the marginal type from the data and the distribution of types.

  • Second step. Using the estimated distribution of types.

Basic idea. Equilibrium condition gives: \[\begin{equation} y_i^* = y^*(a_i; F_{\mathcal{A}}). \end{equation}\]

with \(y^*(\cdot)\) being an invertible function with \(\phi\) denoting the inverse.

Hence the distribution of bids is \[\begin{equation} F_{Y}(y) = \Pr(y_i^* \leq y) = \Pr(y^*(a_i) \leq y) = \Pr(a_i \leq \phi(y)) = F_\mathcal{A}( \phi(y)). \end{equation}\]

Identification of the model. suggest

data(scores) dat <- merge(scores, races[, c(“coder_id”, “treatment”)], by=‘coder_id’) attach(dat)

time_l <- split(timestamp, coder_id)

difftime2 <- function(x) { n <- length(x) init <- as.POSIXct(“2015-03-08 12:00:00 EDT”) y <- x - c(init, x[-n]) units(y) <- “hours” return(as.numeric(y)) }

interval <- unlist(tapply(timestamp, coder_id, FUN=difftime2)) boxplot(interval ~ treatment, outline=FALSE)

difference of location

sapply(interval_l, mean) sapply(interval_l, median)

test difference

interval_l <- split(interval, treatment) kruskal.test(interval_l) # Significant!!!!

Tobit

Theory

In the standard tobit model [@amemiya1985advanced], we have:

\[ y_i = \begin{cases} y_l & \text{if } y^*_i < y_l\\ y^*_i & \text{if } y^*_i \geq y_l\\ \end{cases} \]

Where \(y^* = x\beta + \epsilon\), assuming disturbance \(\epsilon \sim N(0, \sigma^2)\)

Let \(\Phi(\cdot)\) denote the standard normal distribution function and \(\phi(\cdot)\) the corresponding density function. The likelihood is:

\[ \mathcal{L} = \prod_{i}^n \left(\sigma^{-1}\phi\left(\frac{y_i - X_i\beta}{\sigma}\right)\right) ^{I(y_i)} \left(1 - \Phi\left(\frac{X_i\beta - y_l}{\sigma}\right)\right) ^{1-I(y_i)} \]

where \(I(\cdot)\) is an indicator function that is 1 when \(y_i > y_l\) and 0 otherwise.

Application

# AER provides tobit(); library() errors immediately if the package is
# absent, whereas require() only warns and returns FALSE.
library(AER)

# Example with left censoring
n <- 1000
yl <- 0.1               # left-censoring threshold
x1 <- runif(n)
x2 <- rchisq(n, 1)
betas <- c(0.25, -0.1)  # true coefficients
# Simulate left-censored (tobit) outcomes: a Gaussian latent index
# censored from below at yl.
simulate.tobit <- function(n, betas, yl, x1, x2) {
    # Latent outcome: linear index plus standard normal noise.
    latent <- rnorm(n, mean = cbind(x1, x2) %*% betas)
    # Left-censor at the threshold.
    pmax(latent, yl)
}

# Try
# One simulated draw; tobit() should roughly recover `betas`
# (no intercept, left-censored at yl).
y <- simulate.tobit(n, betas, yl = 0.1, x1, x2)
summary(tobit(y ~ x1 + x2 - 1, left = yl))
## 
## Call:
## tobit(formula = y ~ x1 + x2 - 1, left = yl)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           1000            525            475              0 
## 
## Coefficients:
##            Estimate Std. Error z value Pr(>|z|)    
## x1          0.26721    0.06823   3.916 8.99e-05 ***
## x2         -0.09012    0.02347  -3.840 0.000123 ***
## Log(scale) -0.06132    0.03513  -1.746 0.080872 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 0.9405 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 3 
## Log-likelihood:  -992 on 3 Df
## Wald-statistic: 21.03 on 1 Df, p-value: 4.5294e-06
# Simulations
# Sampling distribution of the tobit estimator: 499 re-draws, with
# estimates stored as deviations from the true betas.
coef.sim <- replicate(499, {
    y <- simulate.tobit(n, betas, yl, x1, x2)
    coef(tobit(y ~ x1 + x2 - 1, left = yl)) - betas
})

# Consistency looks good
boxplot(t(coef.sim), ylim = 0.5 * c(-1, 1))

# Inference looks good
# Monte Carlo standard errors of the estimates.
apply(coef.sim, 1, sd)
##         x1         x2 
## 0.07140839 0.02619388